def get_utterances(utterances, line, category, wgram, cgram):
    tknzr = TweetTokenizer()
    wgram_list = []  # fallback in case wgram is not 1, 2 or 3
    # WORD GRAMS
    if wgram == 1:  # unigram
        wgram_list = tknzr.tokenize(line)
    elif wgram == 2:  # uni + bigram
        # unigram list
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        finder = BigramCollocationFinder.from_words(tokens)
        scored = finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, score in scored)
        # res
        wgram_list = tknzr.tokenize(line) + bigram_list
    elif wgram == 3: # uni + bi + trigram
        # unigram list
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        bi_finder = BigramCollocationFinder.from_words(tokens)
        bi_scored = bi_finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, biscore in bi_scored)  
        # trigram list
        tri_finder = TrigramCollocationFinder.from_words(tokens)
        tri_scored = tri_finder.score_ngrams(trigram_measures.raw_freq)
        trigram_list = sorted(trigram for trigram, triscore in tri_scored)
        # res
        wgram_list = tknzr.tokenize(line) + bigram_list + trigram_list
    
    # CHAR GRAMS
    cgram_list = []
    if cgram == 1:   # uni-chargram
        cgram_list = [line[i:i+1] for i in range(len(line))]
    elif cgram == 2: # bi-chargram
        cgram_list = [line[i:i+2] for i in range(len(line)-1)]
    elif cgram == 3: # tri-chargram
        cgram_list = [line[i:i+3] for i in range(len(line)-2)]
        
    # RESULT
    if category == 'QA':            # non-task
        utterances.append((wgram_list + cgram_list, 0))
    elif category == 'Shopping':    # task
        utterances.append((wgram_list + cgram_list, 1))
    elif category == 'Travel':      # task
        utterances.append((wgram_list + cgram_list, 2))
    elif category == 'Hotel':       # task
        utterances.append((wgram_list + cgram_list, 3))
    elif category == 'Food':        # task
        utterances.append((wgram_list + cgram_list, 4))
    elif category == 'Art':         # task
        utterances.append((wgram_list + cgram_list, 5))
    elif category == 'Weather':     # task
        utterances.append((wgram_list + cgram_list, 6))
    elif category == 'Friends':     # task
        utterances.append((wgram_list + cgram_list, 7))
    elif category == 'Chat':        # chat
        utterances.append((wgram_list + cgram_list, 8))
    else:
        print category, "ERROR"
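
# A minimal usage sketch (illustrative, not from the original source): it
# assumes nltk's TweetTokenizer is imported in this module; the collocation
# finders/measures are only needed when wgram >= 2.
utterances = []
get_utterances(utterances, "find me a cheap hotel near the beach", "Hotel", wgram=1, cgram=2)
# utterances now holds one (feature_list, 3) pair: word tokens plus
# bi-character grams, labelled 3 for the 'Hotel' task class.
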
def load_data_and_labels_semeval():
    # load the entire semeval dataset
    old_dataset = list(open("./input/2013-dev"))
    old_dataset.extend(list(open("./input/2013-devtest")))
    old_dataset.extend(list(open("./input/2013-train")))
    old_dataset.extend(list(open("./input/2014-devtest")))

    new_dataset = list(open("./input/2016-train"))
    new_dataset.extend(list(open("./input/2016-dev")))
    new_dataset.extend(list(open("./input/2016-devtest")))

    # filter out invalid tweets from new dataset
    new_dataset = [entry for entry in new_dataset if entry.split('\t')[2] != 'Not Available\n']

    # generate x from old
    tk = TweetTokenizer(reduce_len=True) # handles punctuations
    x_text = [entry.split('\t')[3] for entry in old_dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]

    # generate x from new
    x_text_new = [entry.split('\t')[2] for entry in new_dataset]
    x_text_new = [clean_str(tweet) for tweet in x_text_new]
    x_text_new = [tk.tokenize(tweet) for tweet in x_text_new]

    # concat x and x_new
    x_text.extend(x_text_new)

    # generate y from old
    y = [entry.split('\t')[2] for entry in old_dataset]
    for idx, label in enumerate(y):
        if label == 'positive':
            y[idx] = [1, 0, 0]
        elif label == 'neutral':
            y[idx] = [0, 1, 0]
        elif label == 'negative':
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in semeval: ' + label

    # generate y from new
    y_new = [entry.split('\t')[1] for entry in new_dataset]
    for idx, label in enumerate(y_new):
        if label == 'positive':
            y_new[idx] = [1, 0, 0]
        elif label == 'neutral':
            y_new[idx] = [0, 1, 0]
        elif label == 'negative':
            y_new[idx] = [0, 0, 1]
        else:
            print 'wrong label in semeval: ' + label

    # concat y and y_new
    y.extend(y_new)

    return [x_text, y]
def custom_tokenizer(text, bigrams = None):
    chunks = text.split('-')
    tokenizer = TweetTokenizer(reduce_len=True, preserve_case=False)
    # split on hyphens first, then tweet-tokenize each chunk
    tokens = [ subchunk for chunk in chunks for subchunk in tokenizer.tokenize(chunk) ]
    tokens = [ token for token in tokens if token.isalpha() ]
    if bigrams:
        tokens = mwe_tokenize(tokens, bigrams)
    stemmer = SnowballStemmer('english', ignore_stopwords=True)
    tokens = [ stemmer.stem(token) for token in tokens ]
    return tokens
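
# Illustrative call (assumes the module imports TweetTokenizer and
# SnowballStemmer from nltk; the optional bigrams path also needs the
# module's own mwe_tokenize helper):
tokens_demo = custom_tokenizer("Well-known users keep re-tweeting this")
# -> lowercased, hyphen-split, alphabetic-only, stemmed tokens,
#    e.g. ['well', 'known', 'user', 'keep', 're', 'tweet', 'this']
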
def getVocab():
	freq = []
	vocab = []
	length = 0
	tknzr = TweetTokenizer()
	with open(path+'/data/training/training_stances.csv', 'r', encoding='UTF-8') as csvDataFile: 
		csvReader = csv.reader(csvDataFile)
		first = 1
		for row in csvReader:
			if first == 1:
				first = 0
			else:
				headline = row[0]
				tokens = tknzr.tokenize(headline)
				tokens=[token.lower() for token in tokens if (token.isalpha() and token not in stop_words)]
				#for word in r.split(headline):
				length = length + len(tokens)
				for word in tokens:
					if word not in vocab:
						vocab.append(word)
						freq.append(1)
					else:
						ind = vocab.index(word)
						freq[ind] = freq[ind] + 1
				
	with open(path+'/data/training/train_bodies.csv', 'r', encoding='UTF-8') as csvDataFile: 
		csvReader = csv.reader(csvDataFile)
		first = 1
		for row in csvReader:
			if first == 1:
				first = 0
			else:
				body = row[1]
				tokens = tknzr.tokenize(body)
				tokens=[token.lower() for token in tokens if (token.isalpha() and token not in stop_words)]
				length = length + len(tokens)
				#for word in r.split(headline):
				for word in tokens:
					if word not in vocab:
						vocab.append(word)
						freq.append(1)
					else:
						ind = vocab.index(word)
						freq[ind] = freq[ind] + 1
	return vocab, freq, length


				
#vocab list
#vocab, freq, length = getVocab()
def get_classifier(featx):
    tokenizer = TweetTokenizer()
    print "Training Classifier..."
    negstr = [obj["text"] for obj in handle.negative_tweets.find()]
    posstr = [obj["text"] for obj in handle.positive_tweets.find()]
    negfeats = [(featx(tokenizer.tokenize(Twitter.process_tweet(negstr[i]))), 'neg')
                for i in range(0, len(negstr)-1)]
    posfeats = [(featx(tokenizer.tokenize(Twitter.process_tweet(posstr[i]))), 'pos')
                for i in range(0, len(posstr)-1)]
    trainfeats = negfeats + posfeats

    classifier = NaiveBayesClassifier.train(trainfeats)

    return classifier
def get_features(utterances, ngram, classify_method):
    features = []
    tknzr = TweetTokenizer()
    for utt in utterances:
        utt_content = utt[0]  # text content of the utterance
        utt_category = utt[1]

        if ngram:  # use bow & ngram as feature
            # bow list
            bow_list = tknzr.tokenize(utt_content)
            # cgram list
            uni_cgram_list = [utt_content[i:i+1] for i in range(len(utt_content))]
            bi_cgram_list = [utt_content[i:i+2] for i in range(len(utt_content)-1)]
            tri_cgram_list = [utt_content[i:i+3] for i in range(len(utt_content)-2)]
            feature_list = bow_list         # add bow tokens
            feature_list += uni_cgram_list  # add unigram character lists
            feature_list += bi_cgram_list   # add bigram character lists
            feature_list += tri_cgram_list  # add trigram character lists
        else:  # only use bow as feature
            feature_list = tknzr.tokenize(utt_content)

        if classify_method == 'binary':
            if utt_category == 'QA':  # non-task
                features.append((feature_list, 0))
            else:  # task
                features.append((feature_list, 1))
        elif classify_method == 'multi':
            if utt_category == 'QA':            # non-task
                features.append((feature_list, 0))
            elif utt_category == 'Shopping':    # task
                features.append((feature_list, 1))
            elif utt_category == 'Travel':      # task
                features.append((feature_list, 2))
            elif utt_category == 'Hotel':       # task
                features.append((feature_list, 3))
            elif utt_category == 'Food':        # task
                features.append((feature_list, 4))
            elif utt_category == 'Art':         # task
                features.append((feature_list, 5))
            elif utt_category == 'Weather':     # task
                features.append((feature_list, 6))
            elif utt_category == 'Friends':     # task
                features.append((feature_list, 7))
            elif utt_category == 'Chat':        # chat
                features.append((feature_list, 8))
            else:
                print utt_category,"ERROR"

    return features
def get_test(infile, NUM_TEST):
	with codecs.open(infile, 'rb') as csvfile:
		test = []
		pos_tweets = 0
		neg_tweets = 0
		reader = csv.reader(csvfile)
		tokenizer = TweetTokenizer(preserve_case=True)
		for line in reader:
			if line[0] == "0":
				sent="Negative"
				neg_tweets+=1

				if neg_tweets < NUM_TEST:
					text = tokenizer.tokenize(line[5].decode("utf-8"))
					for i,token in enumerate(text):
						text[i] = re.sub("@[\S]+", "USERNAME", text[i])
						text[i] = re.sub("www.[\S]+|https://[\S]+", "URL", text[i])
						newstr = ""
						for ch in text[i]:
							if ord(ch)>128:
								newstr+= "EMOJI_{0}".format(ord(ch))
								#print [ch], ord(ch)
							else:
								newstr+=(ch)
						text[i] = newstr
					test.append((text, sent))

		
			if line[0] == "4":
				sent = "Positive"
				pos_tweets+=1
				
				if pos_tweets < NUM_TEST:			
					text = tokenizer.tokenize(line[5].decode("utf-8"))
					for i,token in enumerate(text):
						text[i] = re.sub("@[\S]+", "USERNAME", text[i])
						text[i] = re.sub("www.[\S]+|https://[\S]+", "URL", text[i])
						newstr = ""
						for ch in text[i]:
							if ord(ch)>128:
								newstr+= "EMOJI_{0}".format(ord(ch))
								#print [ch], ord(ch)
							else:
								newstr+=(ch)
						text[i] = newstr
					test.append((text, sent))
			

		return test
Example #8
    def _get_nouns(tweet_text):
        """

        Args:
            tweet_text:

        Returns:

        """
        tokenizer = TweetTokenizer()
        tokenizer.tokenize(tweet_text)
        nouns = []
        tag = pos_tag(tokenizer.tokenize(tweet_text))
        nouns.extend([t[0] for t in tag if t[1] == 'NN' or t[1] == 'NNP'])
        return nouns
Example #9
def get_diff(query, event_name):
    tknzr = TweetTokenizer()
    query_strip = tknzr.tokenize(query)
    name_strip = tknzr.tokenize(event_name)
    ratio = 0
    for word in query_strip:
        for word2 in name_strip:
            r = difflib.SequenceMatcher(None, word, word2).ratio()
            rrr = r*r*r
            ratio += rrr
    if ratio >= len(query_strip):
        # for some reason this didn't work
        print ratio, len(name_strip)
        ratio = 100
    return ratio
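
# Illustrative call (difflib and TweetTokenizer are assumed to be imported
# as in the snippets above):
score = get_diff("music festival tickets", "Summer Music Festival 2019")
# the cubed SequenceMatcher ratios are summed over every (query word,
# event-name word) pair; once the sum reaches len(query_strip) the function
# prints a debug line and clamps the score to 100.
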
Example #10
    def _tag_text(self, tweet_text):
        tokenizer = TweetTokenizer()
        tokens = tokenizer.tokenize(tweet_text)
        tagged = nltk.pos_tag(tokens)
        entities = nltk.chunk.ne_chunk(tagged)
        neList = traverse(entities)
        return neList
def load_data_and_labels_sam():
    # load
    with open("./input/2780_freshmen_tweets.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header

    # filter out tweets with unknown sentiment
    dataset = [entry for entry in dataset if entry[4] != '0']

    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[3] for entry in dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]

    # generate y
    y = [entry[4] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1': # positive
            y[idx] = [1, 0, 0]
        elif label == '2': # neutral
            y[idx] = [0, 1, 0]
        elif label == '3': # negative
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in sam: ' + label

    return [x_text, y]
def load_tweetkeywords():
    """
    Check and see which keywords are used in each tweet, and load the association
    table linking tweets and keywords
    """

    # TweetKeyword.query.delete()

    tweets = Tweet.query.all()
    keyword_query = Keyword.query.all()
    keywords = [word.keyword for word in keyword_query]

    tknzr = TweetTokenizer()

    for tweet in tweets:
        tokenized_tweets = tknzr.tokenize(tweet.text)

        for token in tokenized_tweets:
            if token in keywords:
                tweet_id = Tweet.query.filter(Tweet.tweet_id == tweet.tweet_id).one()
                keyword_id = Keyword.query.filter(Keyword.keyword == token).one()
                tweet_keyword = TweetKeyword(keyword_id=keyword_id.keyword_id, tweet_id=tweet_id.tweet_id)
                print "Added to TweetKeyword table: {}".format(tweet_keyword.keyword_id)
                db.session.add(tweet_keyword)

    db.session.commit()
Example #13
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0],
                                                     tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged
def process_tweets(file_name):
    '''
    Person Responsible: Devin Munger

    + file_name: filename of tweets as returned from API based on query
   
    Extract text from file; return dataframe with tweet text, id
    '''
    ## Create empty dataframe
    tweet_df = pd.DataFrame(columns = ["text", "id"])

    tokenizer = TweetTokenizer(preserve_case = False, strip_handles = True)
    ## Read each JSON from file
    with open(file_name) as data_file:
        for entry in data_file.readlines():
            tweet = json.loads(entry)
            tweet_id = str(tweet.get("id", ""))
            text = tweet.get("text", "")
            ## Remove links from text
            text = re.sub(r"http\S+", "", text)
            ## Remove twitter keywords
            text = text.replace("RT ", "")
            ## Remove handle, punctuation from tweet text
            text_words = filter(lambda x: x not in string.punctuation, tokenizer.tokenize(text))
            ## Add tweet to dataframe
            tweet_df.loc[len(tweet_df)] = [" ".join(text_words), tweet_id]
    return tweet_df
def load_csv():
    with open('Tweets.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        count = 1

        reviews = []
        stars = []
        tknzr = TweetTokenizer()
        for row in reader:
            try:
                words=tknzr.tokenize(row['text'])
                label = 'SENT_%s' % count

                #print label
               # TaggedDocument(utils.to_unicode(row['text']).split(), [label])
                # print "label:", label
                #labels = [label]
                #lab_sent = LabeledSentence(words, label)
                #print lab_sent
                #reviews.append(TaggedDocument(utils.to_unicode(row['text']).split(), [label]))
                reviews.append(TaggedDocument(words, [label]))
                stars.append(row['airline_sentiment'])
                count += 1
            except:
                continue

    print "final count:", count
    return reviews, stars
Example #16
def nltk_tokenize(text):
    tokens = []

    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(text)

    return tokens
def reasoning(dList):
	reasonList = []
	tokenizer = TweetTokenizer()
	for tweet in dList:
		print tweet
		# tokenize
		words = tokenizer.tokenize(tweet)
		# get POS tag
		pos_tokens = pos_tag(words)
		# get name entities
		tree = ne_chunk(pos_tokens, binary = False)
		# find relations
		pairs = relextract.tree2semi_rel(tree)
		# get interesting name entities
		reason = []
		for s, tree in pairs:
			reasonStr = ("%s") % tree
			reasonStr = reasonStr.split(" ")
			label = reasonStr[0].replace("(","").strip()
			content = ""
			for wordTag in reasonStr[1:]:
				sp = wordTag.split("/")
				word = sp[0].replace("(","")
				print word
				# content.append(word)
				content += (word + " ")
			# reason: [(label, content)]
			reason.append({"label": label, "content": content})
		# reasonList [reason]
		if len(reason) > 0:
			reasonList.append({"reason": reason})
		print str(len(reasonList)) + "/" + str(len(dList))
	return reasonList
Example #18
def createDataset(filename, MAX_VOCAB_SIZE):
    yaks = []
    tokenizer = TweetTokenizer()
    ids = set()
    numyaks = 0
    for line in open(filename).readlines():
        stuff = line.split(":::")
        id = stuff[0]
        if len(stuff) > 3 and id not in ids:
            numyaks+=1
            sentence = stuff[3]
            ids.add(id)
            tokens = [START_TOKEN]
            tokens.extend(tokenizer.tokenize(sentence.lower()))
            tokens.append(END_TOKEN)
            yaks.append(tokens)
    token_frequency = nltk.FreqDist(itertools.chain(*yaks))
    vocab = token_frequency.most_common(MAX_VOCAB_SIZE-1)
    i2t = [token[0] for token in vocab]
    i2t.append(UNKNOWN_TOKEN)
    t2i = dict()
    for i,t in enumerate(i2t):
        t2i[t] = i
    
    yaks = [[t if t in t2i else UNKNOWN_TOKEN for t in yak] for yak in yaks]
    
    Xtrain = np.asarray([[t2i[token] for token in yak[:-1]] for yak in yaks])
    Ytrain = np.asarray([[t2i[token] for token in yak[1:]] for yak in yaks])
    print "Num unique Yaks: "+str(numyaks)
    return (Xtrain, Ytrain, i2t, t2i)
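
# The Xtrain/Ytrain pair is a standard next-token language-model setup: each
# target row is the input row shifted left by one position. A tiny sketch of
# that relation (START_TOKEN/END_TOKEN are assumed module constants, and the
# file name below is hypothetical):
#   Xtrain, Ytrain, i2t, t2i = createDataset("yaks.txt", MAX_VOCAB_SIZE=8000)
example_ids = [0, 17, 42, 9, 1]                    # e.g. <s> free pizza tonight </s>
x_row, y_row = example_ids[:-1], example_ids[1:]   # inputs vs. next-token targets
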
Example #19
def main():
    text = sys.stdin.read().decode("utf-8")

    tknzr = TweetTokenizer()
    tok = tknzr.tokenize(text)
    saved_object = construct_dict(tok)
    print json.dumps(saved_object)
def keywords_search(reviews):
    key_map = {}
    # for k in open(os.getcwd() + "/KeyWord/keyword_map_general.txt", 'r'):
    for k in open(keyword_general_path, 'r'):
        a = k.strip().split(", ")
        key_map[a[0]] = a[1]

    special_map = {}
    # for k in open(os.getcwd() + "/KeyWord/keyword_map_special.txt", 'r'):
    for k in open(keyword_special_path, 'r'):
        a = k.strip().split(", ")
        special_map[a[0]] = a[1]

    raw = reviews.lower()
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(raw)

    # remove punctuations
    no_punc_tokens = [i for i in tokens if (not i in string.punctuation+string.digits) and (not "." in i)]

    # remove stop words from tokens
    en_stop = get_stop_words('en')
    stopped_tokens = [i for i in no_punc_tokens if not i in en_stop]

    # stem tokens
    # wordnet_lemmatizer = WordNetLemmatizer()
    # stemmed_tokens = [wordnet_lemmatizer.lemmatize(i) for i in stopped_tokens ] 

    chosen_key_words = []

    # Search in general key word
    key_words_dict = dict.fromkeys(key_map.values(), 0)

    # Select keyword use only key word to select
    # s = set(stemmed_tokens)
    s = set(stopped_tokens)
    for t in key_map.keys():
        if t in s:
            key_words_dict[key_map[t]] += 1

    for d in sorted(zip(key_words_dict.values(), key_words_dict.keys()))[:-4:-1]:
        if d[0] > 0:
            chosen_key_words.append(d[1])

    # Search in special keyword
    special_words_dict = dict.fromkeys(special_map.values(), 0)
    #  Select keyword using wordnet

    # Select keyword use only key word to select
    # s = set(stemmed_tokens)
    s = set(stopped_tokens)
    for t in special_map.keys():
        if t in s:
            special_words_dict[special_map[t]] += 1

    for d in sorted(zip(special_words_dict.values(), special_words_dict.keys()))[:-3:-1]:
        if d[0] > 0:
            chosen_key_words.append(d[1])

    return ' '.join(chosen_key_words)
Example #21
    def parse(self, text):

        # Tokenize message
        tokenizer = TweetTokenizer()
        words = tokenizer.tokenize(text)

        retweet_term = 'RT'

        urls = []
        users = []
        hash_tags = []
        for word in words:
            if (word[0] == '@'):
                # user in Twitter
                users.append(word)
            elif (word[0] == '#'):
                # hash tags
                hash_tags.append(word)
            elif (word.find('http:') == 0 or word.find('https:') == 0):
                # url
                urls.append(word)

        for f in urls + users + hash_tags + [retweet_term]:
            if f in words:
                words.remove(f)

        self.words = words
        self.urls = urls
        self.users = users
        self.hash_tags = hash_tags
def format_text(entries, LSTM_shape=True):
	THIS_FOLDER = str(os.path.dirname(os.path.abspath(__file__)))
	sentences = []
	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	decoded = base64.b64decode(entries)
	decoded = str(decoded)
	decoded = decoded[2:]
	decoded = decoded[:-1]
	decoded = decoded.split(".")
	#print(decoded, "is decoded")
	for entry in decoded:
		token_sentences = tokenizer.tokenize(entry)
		for sentence in token_sentences:
			sentences.append(sentence)

	tokenized_sentences = []
	#remove_tokens = ['%', ']', '[', '.', ',', '?', '!', '\'']
	#remove_tokens = string.punctuation
	remove_tokens = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
	stop_words = set(stopwords.words('english'))
	tweet_tknzr = TweetTokenizer()
	for sentence in sentences:
		tokens = tweet_tknzr.tokenize(sentence)
		tokens = list(filter(lambda a: a not in remove_tokens and a not in stop_words, tokens))
		tokenized_sentences.append(tokens)

	all_ngrams1 = np.load(THIS_FOLDER+'/ngrams1.npy').item()
	all_ngrams2 = np.load(THIS_FOLDER+'/ngrams2.npy').item()
	all_ngrams3 = np.load(THIS_FOLDER+'/ngrams3.npy').item()
	#once the model gets updated with good data, ngrams.py needs to get changed/updated too!

	X = np.zeros((len(sentences), len(all_ngrams1)+len(all_ngrams2)+len(all_ngrams3)))
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 1)
		for gram in my_ngrams:
			if gram in all_ngrams1:
				index = all_ngrams1[gram]
				X[i][index] = 1
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 2)
		for gram in my_ngrams:
			if gram in all_ngrams2:
				index = len(all_ngrams1) + all_ngrams2[gram]
				X[i][index] = 1
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 3)
		for gram in my_ngrams:
			if gram in all_ngrams3:
				index = len(all_ngrams1) + len(all_ngrams2) + all_ngrams3[gram]
				X[i][index] = 1


	if LSTM_shape:
		X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
	else:
		X = np.reshape(X, (X.shape[0], X.shape[1]))
	return X
def getTweetTokens(classification, toRead, info, tags):
    i=0
    tknzr = TweetTokenizer()

    with open(toRead) as f:
        content = f.readlines()

    c = 0

    for item in content:
        #adapt the list into python dictionary format
        content[c] = item.replace("null", "None")
        content[c] = content[c].replace("false", "False")
        content[c] = content[c].replace("true", "True")
        c+=1

    for i in range(len(content)):
        tweet = eval(content[i])["text"]
        tokenTweet = tknzr.tokenize(tweet)
        # strip '#' from hashtags and drop @mentions
        j = 0
        while j < len(tokenTweet):
            if tokenTweet[j][0] == "#":
                tokenTweet[j] = tokenTweet[j][1:]
                j += 1
            elif tokenTweet[j][0] == "@":
                del tokenTweet[j]
            else:
                j += 1
            
        info.append((word_feats(tokenTweet), classification))
Example #24
def check():
	check_id = request.args.get("id")
	if check_id is not None:
		check_sentence = Sentence.query.get(check_id)
		if check_sentence is not None:
			Word.query.filter_by(sentence_id=check_id).delete()
			tweet_tokenizer = TweetTokenizer()
			tokens = tweet_tokenizer.tokenize(check_sentence.text)
			for token in tokens:
				url = "http://kateglo.com/api.php?format=json&phrase="+token
				resp = requests.get(url)
				exist = False
				if (resp.ok):
					try:
						resp_json = json.loads(resp.content)
						exist = True
					except ValueError:
						exist = False
				word = Word(check_sentence.id, token, exist)
				db.session.add(word)
			db.session.commit()
	sentences = Sentence.query.all()
	c = ((sentence.id, 
		sentence.source, 
		sentence.text, 
		((w.word, w.exist,) for w in sentence.words.all()), 
		) for sentence in sentences)
	return render_template('check.html', rows=c)
def preprocess_db():
    tkn = TweetTokenizer()
    photos = pd.read_pickle(r'./data/restaurant_photos_with_labels.pkl')
    img_path = r'./data/restaurant_photos/'
    sentid = 1
    img_list = []

    # Split data in such a way that labels are evenly distributed between 6 folds
    skf = StratifiedKFold(photos['label'], n_folds=6)

    folds = []
    # Initialize all images to train dataset initially
    photos['split'] = ['train' for i in range(len(photos))]

    # Obtain the indices for the test and validation splits and change value appropriately
    for _, test_ix in skf:
        folds.append(test_ix)
    photos.split[folds[0]] = 'test'
    photos.split[folds[1]] = 'val'

    # Obtain the information from each picture and move the pictures to the appropriate dir. The images are renamed.
    for i, photo_id in enumerate(photos.photo_id):
        img_dict = dict()
        img_dict['sentids'] = [sentid]
        img_dict['business_id'] = photos.business_id[i]
        if photos.split[i] in ['train']:
            img_dict['filepath'] = u'train'
            img_dict['imgid'] = 0
            img_dict['split'] = u'train'
            shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/train/' + str(sentid).zfill(6) + '.jpg')
        elif photos.split[i] in ['test']:
            img_dict['filepath'] = u'test'
            img_dict['imgid'] = 0
            img_dict['split'] = u'test'
            shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/test/' + str(sentid).zfill(6) + '.jpg')
        else:
            img_dict['filepath'] = u'val'
            img_dict['imgid'] = 0
            img_dict['split'] = u'val'
            shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/val/' + str(sentid).zfill(6) + '.jpg')
        img_dict['label'] = photos.label[i]
        caption_dict = dict()
        if photos.caption[i]:
            # Tokenize the captions
            caption_dict['tokens'] = tkn.tokenize(photos.caption[i])
            caption_dict['raw'] = photos.caption[i]
        else:
            caption_dict['tokens'] = 'None'
            caption_dict['raw'] = 'None'
        caption_dict['imgid'] = 0
        caption_dict['sentid'] = sentid
        img_dict['sentences'] = [caption_dict]
        img_dict['photoid'] = sentid
        img_dict['yelpid'] = photo_id
        img_list.append(img_dict)
        sentid += 1

    # Store the new dataset as a JSON file
    with open("./data/image_caption_dataset.json", "w") as outfile:
        json.dump(img_list, outfile)
def load_data_and_labels_gameforum():
    # load
    with open("./input/gameforum-1000.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header

    dataset = [entry for entry in dataset if (entry[1] == '1' or entry[1] == '2' or entry[1] == '3')]

    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[0] for entry in dataset]
    x_text = [clean_str(post) for post in x_text]
    x_text = [tk.tokenize(post) for post in x_text]

    # generate y
    y = [entry[1] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1':  # positive
            y[idx] = [1, 0, 0]
        elif label == '2':  # neutral
            y[idx] = [0, 1, 0]
        elif label == '3':  # negative
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in gameforum: ' + label

    return [x_text, y]
def preprocess_tweets(event_date, dt=datetime.timedelta(seconds=30),
                      match=None, tweet_processor=None, match_type='home'):
    import collections
    
    tknzr = TweetTokenizer()
    
    dbname = match['dbname']
    collname_home = match['collname_home']
    collname_away = match['collname_away']
    home_team = match['home_team']
    away_team = match['away_team']
    
    if match_type == 'home':
        coll = client[dbname][collname_home]
    else:
        coll = client[dbname][collname_away]

    # add some padding to the start and end times
    date_start = event_date - dt
    date_end = event_date + dt

    query = { "created_at": {"$gt": date_start, "$lt": date_end}}
    
    results = coll.find( query )
    clean_tweets = []
    for result in results:
        tweet_id = result['id_str']
        tweet_split = tweet_processor.preprocess(result['text'].encode('ascii', 'ignore'))
        
        parts = tknzr.tokenize(tweet_split)
        clean = [i for i in parts if i not in stop]
        clean_text = " ".join (clean)
        clean_tweets.append( (clean_text, tweet_id) )
        
    return clean_tweets
Example #28
def get_best_words():
    tokenizer = TweetTokenizer()
    # Analyze frequencies
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    negstr = [obj["text"] for obj in handle.negative_tweets.find()]
    posstr = [obj["text"] for obj in handle.positive_tweets.find()]
    negwords = []
    poswords = []
    for i in range(0, len(negstr)-1):
        for w in tokenizer.tokenize(Twitter.process_tweet(negstr[i])):
            if w not in stopwords.words("english"):
                negwords.append(w)
    for i in range(0, len(posstr)-1):
        for w in tokenizer.tokenize(Twitter.process_tweet(posstr[i])):
            if w not in stopwords.words("english"):
                poswords.append(w)

    for word in poswords:
        word_fd[word] += 1
        label_word_fd['pos'][word] += 1
    for word in negwords:
        word_fd[word] += 1
        label_word_fd['neg'][word] += 1
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # Score words
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(
                            label_word_fd['pos'][word], (freq, pos_word_count),
                            total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
                            label_word_fd['neg'][word], (freq, neg_word_count),
                            total_word_count)
        word_scores[word] = pos_score + neg_score

    # Keep best 10000 words
    best = sorted(
        word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:10000]
    bestwords = set([w for w, s in best])

    return bestwords
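
# A hedged sketch of wiring the chi-square-selected vocabulary into the featx
# callable expected by get_classifier() above (handle, Twitter and the NLTK
# imports are assumed from the surrounding module):
bestwords = get_best_words()

def best_word_feats(words):
    # keep only the informative words as boolean features
    return dict((word, True) for word in words if word in bestwords)

classifier = get_classifier(best_word_feats)
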
def bothTwitterAndMovie():
    tknzr = TweetTokenizer(strip_handles=True)
    onlyWords = re.compile('^[a-zA-Z]+$')

    f = open('movieTwitter_semtiment_classifier.pickle', 'rb')
    classifier = pickle.load(f)  # type: nltk.classify.naivebayes.NaiveBayesClassifier
    f.close()
    # text,created_at
    tweets = []

    labeledTweets = []
    for row in csv.DictReader(open('datafiles/trump.csv')):
        text = row['text']
        features = []
        for token in tknzr.tokenize(text):
            if onlyWords.match(token) is not None:
                features.append(token.lower())
        print row['created_at']
        tweets.append({
            "created_at": row['created_at'],
            "text": text,
            "classification": classifier.classify(word_feats(features))
        })
    classification = open('trumpClassified_both.json', 'w+')
    classification.write(json.dumps(tweets, indent=2))
    classification.close()

    tweets = []
    labeledTweets = []
    for row in csv.DictReader(open('datafiles/clinton.csv')):
        text = row['text']
        features = []
        for token in tknzr.tokenize(text):
            if onlyWords.match(token) is not None:
                features.append(token.lower())
        print row['created_at']
        tweets.append({
            "created_at": row['created_at'],
            "text": text,
            "classification": classifier.classify(word_feats(features))
        })
    classification = open('clintonClassified_both.json', 'w+')
    classification.write(json.dumps(tweets, indent=2))
    classification.close()
Example #30
def classify(classifier, featx, strings):
    print "Classify request"
    tokenizer = TweetTokenizer()
    mood = []
    for string in strings:
        string = Twitter.process_tweet(string)
        tokenized_text = [word.lower() for word in tokenizer.tokenize(string)]
        mood.append(classifier.classify(featx(tokenized_text)))
    return mood
Example #31
from nltk.tokenize import MWETokenizer, TweetTokenizer

limit = "limit 5000"  #limit number of results
Q_ALL = "select text from scraped order by scraped.date_posted " + limit

con = sqlite3.connect('baza.db')
cur = con.cursor()
cur.execute(Q_ALL)

data = cur.fetchall()
data = [a[0] for a in data]
data_string = " ".join(data)  #list of texts to string

#TweetTokenizer
tokenizer = TweetTokenizer(preserve_case=False)
tokens = tokenizer.tokenize(data_string)

#remove punctuation and links from tokens
fil = re.compile('.*[A-Za-z0-9].*')
urls = re.compile(
    'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

tokens = [w for w in tokens if fil.match(w)]
tokens = [w for w in tokens if not urls.match(w)]

print("All tokens: %d" % len(tokens))
print("Unique tokens: %d" % len(set(tokens)))
print("Lexical diveristy: %f" % (len(set(tokens)) / len(tokens)))
fdist = nltk.FreqDist([t for t in tokens if len(t) > 4])
common = fdist.most_common(20)
Example #32
    
    
if __name__ == '__main__':
    
    count_lines = 0
    sents_with_ne = 0
    sents_with_comp_super = 0
    sents_with_sentiment = 0

    with open('/home/dasha/Документы/курс/prj-nlp-2020/tasks/02-structural-linguistics/data/examiner-headlines.txt', 'r') as f:
        lines = f.readlines()

        for i, line in enumerate(lines):
            count_lines += 1

            tokens = tknzr.tokenize(line)
            cleaned_tokens = [t for t in tokens if t not in string.punctuation] #and t.lower() not in stop_words]
            tagged_tokens = pos_tag(cleaned_tokens)

            if comparative_superlative(tagged_tokens) == 1:
                sents_with_comp_super += 1
            if find_ne(tagged_tokens) == 1:
                sents_with_ne += 1
            if get_sentiment(tagged_tokens) == 1:
                sents_with_sentiment += 1

    print('Sentences with Named Entities: {0}%\nSentences with sentiment: {1}%\nSentences with Adjectives/Adverbs: {2}%'.format((sents_with_ne/count_lines)*100, round((sents_with_sentiment/count_lines)*100, 2), (sents_with_comp_super/count_lines)*100))
    
>>> Sentences with Named Entities: 79.64%  
>>> Sentences with sentiment: 45.3%  
>>> Sentences with Adjectives/Adverbs: 4.06%
def convert_answers_to_words(text):
    tokenizer_words = TweetTokenizer()
    tokens_sentences = [tokenizer_words.tokenize(
        t) for t in nltk.sent_tokenize(text)]
    print(tokens_sentences)
    return tokens_sentences
Example #34
    text_raw = generate_training(num_train, num_add, path)
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    model_path = "lda_model/model"

    print("Total Number of ", len(text_raw), " Training Data Loaded")

    # Load stop words
    words_stop = stopwords.words('english')
    stop_set = set()
    for i in range(len(words_stop)):
        stop_set.add(Remove_Symbols(words_stop[i]))

    text_split = []
    for i in range(len(text_raw)):
        word_nonstop = []
        for word in tknzr.tokenize(text_raw[i]):
            if word not in stop_set:
                if len(word) <= 3:
                    continue

                word_nonstop.append(word)

        if len(word_nonstop) > 0:
            text_split.append(word_nonstop)

    # Word Dictionary
    dic = corpora.Dictionary(text_split)

    # Generate Corpus
    corpus = [dic.doc2bow(text) for text in text_split]
Example #35
def tokenizeTweet(s):
    tknzr = TweetTokenizer()
    s0 = s
    s1 = tknzr.tokenize(s0)
    return (s1)
Example #36
class Preprocessor(object):
    def __init__(self):
        self.tokenizer = TweetTokenizer()
        self.lemmatizer = WordNetLemmatizer()

    @staticmethod
    def remove_dates(comment):
        """
        Removes date time and time zone information from the comments
        """
        comment = comment.lower()
        comment = re.sub(
            """(jan|january|feb|february|mar|march|apr|april|may|jun|june|jul|july|aug|august|sep|september|oct|october|nov|november|dec|december)\s\d{1,2}\s\d{2,4}""",
            ' ', comment)
        comment = re.sub(
            """\d{1,2}\s(jan|january|feb|february|mar|march|apr|april|may|jun|june|jul|july|aug|august|sep|september|oct|october|nov|november|dec|december)\s\d{2,4}""",
            ' ', comment)
        comment = re.sub("""\d{1,2}:\d{1,2}""", ' ', comment)
        comment = re.sub("""utc""", ' ', comment)
        comment = " ".join(comment.split())
        return comment

    def clean_text(self, comment):
        """
        This function receives comments and returns clean word-list
        """
        # convert comment to lower case
        comment = comment.lower()

        # remove \n (new line characters)
        comment = re.sub("\\n", " ", comment)

        # remove URLs
        comment = re.sub(
            r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
            " ", comment)

        # remove ip addresses
        comment = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", comment)

        # remove usernames
        comment = re.sub("\[\[.*\]", " ", comment)

        # remove date time and time zone
        comment = self.remove_dates(comment)

        # remove repeating characters in a word ex: abbbbcd ==> abcd
        pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
        comment = pattern.sub(r"\1", comment)

        # remove repeating words ex: you said that that that ==> you said that
        comment = re.sub(r'(\W|^)(.+)\s\2', '', comment)

        # substitute regex patterns for vulgar words ex: f***k ==> f**k
        for target, patterns in RE_PATTERNS.items():
            for pat in patterns:
                comment = re.sub(pat, target, comment)

        # remove if there are any extra spaces in comment
        comment = " ".join(comment.split())

        # perform tokenization
        words = self.tokenizer.tokenize(comment)

        # apostrophe replacement, e.g. you're --> you are
        words = [
            APOSTROPHE_MAP[word] if word in APOSTROPHE_MAP else word
            for word in words
        ]

        comment = " ".join(words)
        # remove special chars
        comment = re.sub(r"[^a-z0-9!#\$%\^\&\*_\-,\.\'()\/ ]", ' ', comment)

        # perform lemmatization
        words = [
            self.lemmatizer.lemmatize(word, "v") for word in comment.split()
        ]
        #     words = [w for w in words if not w in STOPWORDS]

        clean_sent = " ".join(words)
        # remove any remaining non-alphanumeric characters
        clean_sent = re.sub("\W+", " ", clean_sent)
        clean_sent = re.sub("  ", " ", clean_sent)
        return (clean_sent)
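
# Illustrative use of the cleaning pipeline (RE_PATTERNS and APOSTROPHE_MAP
# are module-level constants assumed to exist alongside the class):
pre = Preprocessor()
clean = pre.clean_text("You're posting the SAME link http://spam.example.com again!!!")
# URLs, dates, repeated characters and special symbols are stripped,
# contractions are expanded via APOSTROPHE_MAP, and the remaining words are
# lemmatized with WordNetLemmatizer.
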
def processTweetText(raw_tweet_dict, sentiment_dict):

    ## Defining the dictionaries to be used
    emoticon_bag = defaultdict(dict)
    hash_tag_bag = defaultdict(dict)
    user_ref_bag = defaultdict(dict)
    tweet_dict = defaultdict(dict)
    tweet_cmplt = defaultdict(dict)
    sentiment_score_dict = defaultdict(dict)

    tweetSplitter = TweetTokenizer()
    sid = SentimentIntensityAnalyzer()

    ## Downloading Stop words from NLTK
    nltkStopWords = list(stopwords.words('english'))
    stopWords = list(get_stop_words('en'))
    stopWords.extend(nltkStopWords)

    wordDict = defaultdict()

    s_token = time.clock()

    for tweet_id in raw_tweet_dict:
        tweet = raw_tweet_dict[tweet_id]
        sentence = tweet.tweet_text
        sentiment_score = sid.polarity_scores(sentence)
        wordBag = tweetSplitter.tokenize(sentence.replace('RT ', ''))
        newWordBag = []
        emoticonList = []
        hashTagList = []
        userRefList = []

        for word_case in wordBag:
            word = word_case.lower()
            if word in stopWords:
                continue
            if word in sentiment_dict:
                emoticonList.append(word)
            elif word.startswith('@'):
                userRefList.append(word.replace('@', ''))
            elif word.startswith('#'):
                print(word)
                hashTagList.append(word.replace('#', ''))
            elif word.isalpha():
                if word not in wordDict:
                    wordDict[word] = 1
                newWordBag.append(word)
        if len(newWordBag) > 3:
            tweet_dict[tweet_id] = newWordBag
            hash_tag_bag[tweet_id] = hashTagList
            emoticon_bag[tweet_id] = emoticonList
            user_ref_bag[tweet_id] = userRefList
            sentiment_score_dict[tweet_id] = sentiment_score
            tweet_cmplt[tweet_id] = sentence.replace('\n', ' ')

    final_dict = defaultdict(dict)

    final_dict['tweet_dict'] = tweet_dict
    final_dict['hash_tag_bag'] = hash_tag_bag
    final_dict['emoticon_bag'] = emoticon_bag
    final_dict['user_ref_bag'] = user_ref_bag
    final_dict['sentiment_score_dict'] = sentiment_score_dict
    final_dict['tweet_cmplt'] = tweet_cmplt
    final_dict['wordDict'] = wordDict

    e_token = time.clock()
    print('processTweetText() Time : ', e_token - s_token)

    return final_dict
#%%
import spacy
from nltk.tokenize import TweetTokenizer

nlp = spacy.load('en')
text = "Mary, don’t slsap the green witch"
print([str(token) for token in nlp(text.lower())])
tweet = u"Snow White and the Seven Degrees#MakeAMovieCold@midnight:-)"
tokenizer = TweetTokenizer()
print(tokenizer.tokenize(tweet.lower()))
#%%
class AutoToken(object):


    def __init__(self, trackTokens=False):
        from nltk.tokenize import TweetTokenizer
        self.tweetTokenizer = TweetTokenizer(reduce_len=True)
        self._reMention = re.compile('@\w+')
        self._numbers = frozenset('1,2,3,4,5,6,7,8,9,0'.split(','))
        self.stopWords = self._calculateStopwords()

        self.tokenTracker = defaultdict(set) if trackTokens else None

        self.tokenTransformers = OrderedDict([
            ('<URL>', self._isUrl),
            ('<BTC>', self._isBitcoin),
            ('<ALTCOIN>', self._isAltCoint),
            ('<INT_NUMBER>', self._isIntNumber),
            ('<FLOAT_NUMBER>', self._isFloatNumber),
            ('<MENTION>', self._isMention),
        ])


    def _calculateStopwords(self):
        engStopWords = set( stopwords.words('english') )
        engStopWords.update( ', . : ( ) " | [ ] \' *'.split(' ') )
        return engStopWords

    def __call__(self, tweet):

        for token in self.tweetTokenizer.tokenize(tweet):

            if token in self.stopWords:
                continue

            normlizedToken = self._normalizeToken( token )
            yield normlizedToken


    def _normalizeToken(self, token):

        for tokenType, normalizer in self.tokenTransformers.items():
            if normalizer( token ):

                if self.tokenTracker is not None:
                    self.tokenTracker[tokenType].add( token )

                return tokenType

        return token

    def _isUrl(self, token):
        return token.startswith('http') or token.startswith('www') or '.com' in token

    def _isBitcoin(self, token):
        return token.replace('#', '') in frozenset('btc,bitcoin'.split(','))

    def _isMention(self, token):
        return  self._reMention.match(token)

    def _isAltCoint(self, token):
        return token.replace('#', '') in frozenset('eth,ltc,ethereum,litecoin,altcoin'.split(','))

    def _preProcessNumber(self, token):
        token = token.replace(',', '').replace('#', '')
        if token and token[-1] in {'-', '+'} and token[0] in self._numbers:
            token = token[-1] + token[:-1]

        return token


    def _isFloatNumber(self, token):
        token = self._preProcessNumber(token)
        try:
            _ = float(token)
            return True

        except ValueError:
            pass

        return False

    def _isIntNumber(self, token):
        token = self._preProcessNumber(token)

        try:
            _ = int(token)
            return True

        except ValueError:
            pass

        return False
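
# A brief usage sketch of the generator interface (re, OrderedDict,
# defaultdict and the NLTK stopwords used by the class are assumed to be
# imported in this module):
tok = AutoToken(trackTokens=True)
normalized = list(tok("#btc just hit 9,000 today https://example.com @whale"))
# stop words are dropped, while '#btc', '9,000', the URL and the mention are
# collapsed to the <BTC>, <INT_NUMBER>, <URL> and <MENTION> placeholders;
# tok.tokenTracker records which raw strings mapped to each placeholder.
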
Example #40
        line = tweet['full_text']

    # Strip leading and trailing whitespace
    line = line.strip()
    # Remove links
    line = remove_links(line)

    # Remove newlines
    line = line.replace('\n', ' ')

    # Remove non-ASCII characters
    line = NormalizeText.remove_nonascii(line)

    # Tokenize text
    tweet_sent = sent_tokenize(line)  # Tokenize sentences
    tweet_word = tTokenizer.tokenize(line)
    tweet_unique = list(set(tweet_word))  # Eliminate duplicated words

    # Analyse sentiment
    ss1 = TextBlob(line)
    ss2 = sid.polarity_scores(line)

    # Update the cumulative sentiment scores
    tss1 += ss1.sentiment.polarity
    tss2 += ss2['compound']

    # add the sentiment results to the tweet's JSON
    tweet.update({
        'sentiment': {
            'textblob': ss1.sentiment.polarity,
            'nltk': ss2['compound']
# In[1]:

import sys
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer

# In[14]:

#que 1
datafile = pd.read_csv("tweets-dataset.csv")
total_token = list()
total_type = set()
tk = TweetTokenizer()
for sentence in datafile['Sentence']:
    word_list = tk.tokenize(sentence)
    for ele in range(len(word_list)):
        word = word_list[ele]
        total_token.append(word)
        total_type.add(word)

# In[15]:

print("token :", len(total_token))
print("type :", len(total_type))
ttr = len(total_type) / len(total_token)
print("ttr : ", ttr, sep="")

# In[23]:

#que 3
    "weren't", 'what', "what's", 'when', "when's", 'where', "where's", 'which',
    'while', 'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would',
    "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours',
    'yourself', 'yourselves'
]

terms_all = []
users = []

for user in list(set(tweets_trimmed['query'])):
    user_sub = tweets_trimmed[tweets_trimmed['query'] == user]
    tweets = list(set(user_sub['content']))
    # unigrams only
    user_terms = []
    for tweet in tweets:
        terms = tknzr.tokenize(tweet.lower())
        for term in terms:
            if (not any(rr.search(term) for rr in remove_regex)) and (
                    term not in punctuation) and not (term.startswith('www')):
                if term not in en_stop:
                    stemmed_term = p_stemmer.stem(term)
                    user_terms.append(stemmed_term.encode('ascii', 'ignore'))
    terms_all.append(user_terms)
    users.append(user)

dictionary = corpora.Dictionary(terms_all)

corpus = [dictionary.doc2bow(term) for term in terms_all]

ldamodel = models.ldamodel.LdaModel(corpus,
                                    num_topics=200,
Example #43
    def test_remove_handle(self):
        """
        Test remove_handle() from casual.py with specially crafted edge cases
        """

        tokenizer = TweetTokenizer(strip_handles=True)

        # Simple example. Handles with just numbers should be allowed
        test1 = "@twitter hello @twi_tter_. hi @12345 @123news"
        expected = ['hello', '.', 'hi']
        result = tokenizer.tokenize(test1)
        assert result == expected

        # Handles are allowed to follow any of the following characters
        test2 = "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n ñ@n.ü@n.ç@n."
        expected = [
            '`',
            '~',
            '(',
            ')',
            '-',
            '=',
            '+',
            '\\',
            '|',
            '[',
            ']',
            '{',
            '}',
            ';',
            ':',
            "'",
            '"',
            '/',
            '?',
            '.',
            ',',
            '<',
            '>',
            'ñ',
            '.',
            'ü',
            '.',
            'ç',
            '.',
        ]
        result = tokenizer.tokenize(test2)
        assert result == expected

        # Handles are NOT allowed to follow any of the following characters
        test3 = "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n"
        expected = [
            'a',
            '@n',
            'j',
            '@n',
            'z',
            '@n',
            'A',
            '@n',
            'L',
            '@n',
            'Z',
            '@n',
            '1',
            '@n',
            '4',
            '@n',
            '7',
            '@n',
            '9',
            '@n',
            '0',
            '@n',
            '_',
            '@n',
            '!',
            '@n',
            '@',
            '@n',
            '#',
            '@n',
            '$',
            '@n',
            '%',
            '@n',
            '&',
            '@n',
            '*',
            '@n',
        ]
        result = tokenizer.tokenize(test3)
        assert result == expected

        # Handles are allowed to precede the following characters
        test4 = "@n!a @n#a @n$a @n%a @n&a @n*a"
        expected = ['!', 'a', '#', 'a', '$', 'a', '%', 'a', '&', 'a', '*', 'a']
        result = tokenizer.tokenize(test4)
        assert result == expected

        # Tests interactions with special symbols and multiple @
        test5 = "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n"
        expected = [
            '!',
            '@n',
            '#',
            '@n',
            '$',
            '@n',
            '%',
            '@n',
            '&',
            '@n',
            '*',
            '@n',
            '@n',
            '@n',
            '@',
            '@n',
            '@n',
            '@',
            '@n',
            '@n_',
            '@n',
            '@n7',
            '@n',
            '@nj',
            '@n',
        ]
        result = tokenizer.tokenize(test5)
        assert result == expected

        # Tests that handles can have a max length of 20
        test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmnopqrst1234 @abcdefghijklmnopqrst_ @abcdefghijklmnopqrstendofhandle"
        expected = ['uvwxyz', '1234', '_', 'endofhandle']
        result = tokenizer.tokenize(test6)
        assert result == expected

        # Edge case where an @ comes directly after a long handle
        test7 = "@abcdefghijklmnopqrstu@abcde @abcdefghijklmnopqrst@abcde @abcdefghijklmnopqrst_@abcde @abcdefghijklmnopqrst5@abcde"
        expected = [
            'u',
            '@abcde',
            '@abcdefghijklmnopqrst',
            '@abcde',
            '_',
            '@abcde',
            '5',
            '@abcde',
        ]
        result = tokenizer.tokenize(test7)
        assert result == expected
    def f_create_data(self, args):
        self.m_min_occ = args.min_occ
        self.m_max_line = 1e5

        self.m_data_dir = args.data_dir
        self.m_data_name = args.data_name
        self.m_raw_data_file = args.data_file
        self.m_raw_data_path = os.path.join(self.m_data_dir,
                                            self.m_raw_data_file)

        self.m_vocab_file = self.m_data_name + ".vocab.json"
        ### to save new generated data
        self.m_data_file = "tokenized_" + self.m_data_name + ".pickle"

        data = pd.read_pickle(self.m_raw_data_path)
        train_df = data["train"]
        valid_df = data["valid"]

        tokenizer = TweetTokenizer(preserve_case=False)

        train_reviews = train_df.review
        train_item_ids = train_df.itemid
        train_user_ids = train_df.userid

        valid_reviews = valid_df.review
        valid_item_ids = valid_df.itemid
        valid_user_ids = valid_df.userid

        vocab_obj = _Vocab()

        self._create_vocab(vocab_obj, train_reviews)

        review_corpus = defaultdict(dict)
        item_corpus = defaultdict(dict)
        user_corpus = defaultdict(dict)
        user2uid = defaultdict()

        stop_word_ids = [
            vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>'])
            for w in stopwords.words()
        ]
        punc_ids = [
            vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>'])
            for w in string.punctuation
        ]

        print("loading train reviews")

        ss_time = datetime.datetime.now()

        non_informative_words = stop_word_ids + punc_ids
        print("non informative words num", len(non_informative_words))

        for index, review in enumerate(train_reviews):
            if index > self.m_max_line:
                break

            item_id = train_item_ids.iloc[index]
            user_id = train_user_ids.iloc[index]

            words = tokenizer.tokenize(review)

            word_ids = [
                vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in words
            ]
            review_id = len(review_corpus['train'])
            review_obj = _Review()
            review_obj.f_set_review(review_id, word_ids, non_informative_words)

            review_corpus["train"][review_id] = review_obj

            if user_id not in user_corpus:
                user_obj = _User()
                user_obj.f_set_user_id(user_id)
                user_corpus[user_id] = user_obj

                user2uid[user_id] = len(user2uid)

            uid = user2uid[user_id]
            user_obj = user_corpus[user_id]
            user_obj.f_add_review_id(review_id)

            if item_id not in item_corpus:
                item_obj = _Item()
                item_corpus[item_id] = item_obj
                item_obj.f_set_item_id(item_id)

            review_obj.f_set_user_item(uid, item_id)

            item_obj = item_corpus[item_id]
            item_obj.f_add_review_id(review_obj, review_id)

        e_time = datetime.datetime.now()
        print("load training duration", e_time - ss_time)

        s_time = datetime.datetime.now()

        user_num = len(user_corpus)
        vocab_obj.f_set_user(user2uid)

        save_item_corpus = {}

        print("item num", len(item_corpus))

        print("loading valid reviews")
        for index, review in enumerate(valid_reviews):

            if index > self.m_max_line:
                break

            item_id = valid_item_ids.iloc[index]
            user_id = valid_user_ids.iloc[index]

            words = tokenizer.tokenize(review)

            word_ids = [
                vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in words
            ]

            review_id = len(review_corpus["valid"])

            review_obj = _Review()
            review_obj.f_set_review(review_id, word_ids, non_informative_words)

            review_corpus["valid"][review_id] = review_obj

            # assumes every validation user also appears in the training split;
            # an unseen user_id would raise a KeyError here
            uid = user2uid[user_id]
            review_obj.f_set_user_item(uid, item_id)

            item_obj = item_corpus[item_id]
            # print(len(item_corpus))
            item_obj.f_get_RRe(review_obj)

        save_data = {
            "item": save_item_corpus,
            "review": review_corpus,
            "user": user_num
        }

        print("save data to ", self.m_data_file)
        data_pickle_file = os.path.join(self.m_data_dir, self.m_data_file)
        f = open(data_pickle_file, "wb")
        pickle.dump(save_data, f)
        f.close()

        vocab = dict(w2i=vocab_obj.m_w2i,
                     i2w=vocab_obj.m_i2w,
                     user2uid=vocab_obj.m_user2uid)
        with io.open(os.path.join(self.m_data_dir, self.m_vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))
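A minimal sketch (not part of the original class) of reading back what f_create_data writes; the directory and dataset name are invented for illustration, and unpickling the review corpus requires the project's _Review/_Item/_User classes to be importable.

import json
import os
import pickle

data_dir = "./data"     # hypothetical args.data_dir
data_name = "mydata"    # hypothetical args.data_name

# pickle written above as "tokenized_" + data_name + ".pickle"
with open(os.path.join(data_dir, "tokenized_" + data_name + ".pickle"), "rb") as f:
    saved = pickle.load(f)  # {"item": {...}, "review": {"train": ..., "valid": ...}, "user": user_num}

# vocabulary written above as data_name + ".vocab.json"
with open(os.path.join(data_dir, data_name + ".vocab.json"), "r", encoding="utf8") as f:
    vocab = json.load(f)    # {"w2i": ..., "i2w": ..., "user2uid": ...}

print(saved["user"], "users,",
      len(saved["review"]["train"]), "train reviews,",
      len(vocab["w2i"]), "vocabulary entries")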
Example #45
def judgeWords(infile):
    # flagAt is a module-level flag; declare it up front so the branches below
    # can set it (a global statement after the first assignment is a SyntaxError)
    global flagAt

    # read word list
    readWordlist()
    # tweetlist = ['physics', 'chemistry', 1997, 2000]

    # If it becomes true then @ occur
    start = False
    with open(infile) as f1:

        # for all tweet
        twitterList = []
        rows = f1.readlines()
        for row in rows:
            row = row.replace(".", '')

            # row = "feels so lovely to be back in the can and she loves you #usa brothermerl looooooooove good bad lol @user @ Navarro College"
            tknzr = TweetTokenizer()
            text = tknzr.tokenize(row)
            tempSen = nltk.pos_tag(text)

            splitrow = row.split()
            # print(splitrow)
            # tweetlist = splitrow
            tweetlist = []
            length = len(splitrow)
            # print(length)

            # enumerate(row, start=0)
            for index, word in enumerate(splitrow, start=0):
                checkPos(word)
                checkNeg(word)
                # print(word)
                # length = len(splitrow)
                # print(index)

                if ((start == True) and (word != "@")):
                    tempWord = tempWord + " " + word
                    # print(tempWord)
                    if ((start == True) and (index == length - 1)):
                        judgelist = [tempWord, "@"]
                        tweetlist.append(judgelist)
                        # print("#######")

                        # print("start t end f")

                elif ((word == "@") and (start == True)):
                    # start = False
                    # end = True
                    #judgelist = [tempWord, "@"]
                    flagAt = True
                    #tweetlist.append(judgelist)
                    tempWord = word
                    # print(word)

                    start = True

                elif ((word == "@") and (start == False)):
                    flagAt = True
                    tempWord = word
                    start = True

                    # handle # tag
                    # elif(word[0] == "#"):
                    #    word = word[1:]
                    #   word = word

                elif (word == "@user"):
                    pass
                    # do not add to list

                else:
                    if jstop(word):
                        a = "ST"
                    elif nonEnW(word):
                        a = "NE"
                    else:
                        a = tempSen[index][1]

                        ###
                        # AB NN VB JJ
                        # change to four type
                        ###
                        if (a == "NNS"):
                            # NNS and NN would be NN
                            a = "NN"
                        if (a == "VBZ"):
                            # VB and VBZ would be VB
                            a = "VB"
                        if (a == "RB"):
                            # RB would be AB
                            a = "AB"
                        if (a == "VBN"):
                            a = "VB"
                        if (a == "NNP"):
                            a = "NN"
                        if (a == "VBD"):
                            a = "VB"
                        if (a == "VBP"):
                            a = "VB"
                        if (a == "CD"):
                            a = "NN"
                        if (a == "JJR"):
                            a = "JJ"
                        if (a == "VBG"):
                            a = "VB"
                        if (a == "JJS"):
                            a = "JJ"

                    judgelist = [word, a]
                    tweetlist.append(judgelist)
                    # b = np.append(b, judgelist)
                    # tweetlist = tweetlist.append(judgelist)

            # repeat feature
            if flagRepeted == True:
                judgelist = [1, "FT"]
                #tweetlist.append(judgelist)
            else:
                judgelist = [0, "FT"]
                #tweetlist.append(judgelist)

            # slang feature
            if flagSlang == True:
                judgelist = [1, "FT"]
                #tweetlist.append(judgelist)
            else:
                judgelist = [0, "FT"]
                #tweetlist.append(judgelist)

            # pos feature
            if flagPos == True:
                judgelist = [1, "FT"]
                #tweetlist.append(judgelist)
            else:
                judgelist = [0, "FT"]
                #tweetlist.append(judgelist)

            # neg feature
            # NOTE: this mirrors the pos feature above and re-checks flagPos;
            # a negative flag set by checkNeg() is presumably what is intended
            if flagPos == True:
                judgelist = [1, "FT"]
                #tweetlist.append(judgelist)
            else:
                judgelist = [0, "FT"]
                #tweetlist.append(judgelist)

            # @ feature
            if flagAt == True:
                judgelist = [1, "FT"]
                #tweetlist.append(judgelist)
            else:
                judgelist = [0, "FT"]
                #tweetlist.append(judgelist)

            # print(tweetlist)
            start = False
            iniFlag()

            twitterList.append(tweetlist)
        return twitterList
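The long if-chain above collapses Penn Treebank tags into four coarse classes (NN, VB, AB, JJ). A dictionary lookup expresses the same mapping more compactly; this is only a sketch of an equivalent rewrite, not code from the original script.

COARSE_TAGS = {
    "NNS": "NN", "NNP": "NN", "CD": "NN",
    "VBZ": "VB", "VBN": "VB", "VBD": "VB", "VBP": "VB", "VBG": "VB",
    "RB": "AB",
    "JJR": "JJ", "JJS": "JJ",
}

def coarse_tag(tag):
    # fall back to the original tag when no coarse class is defined
    return COARSE_TAGS.get(tag, tag)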
fileList = (list(iter_rows(worksheet)))
fileList1 = (list(iter_rows(worksheet1)))
geoLocation = []
tweet = []

for col in fileList:
    geoLocation.append(col[5])  # column index 5 holds the geolocation
    tweet.append(col[6])  # column index 6 holds the tweet text
for col in fileList1:
    sentimental_dictionary[col[0]] = col[1]

counter = 2
tweetgrade = {}
for i in tweet:
    currentTweet = tknzr.tokenize(i.casefold())
    currentTweet = [
        word for word in currentTweet if word not in cachedStopWords
    ]
    currentTweet = [word for word in currentTweet if word in english_vocab]
    # remove '.', '#', ':', '!' and '?' anywhere in each token
    currentTweet = [
        s.replace('.', '').replace('#', '').replace(':', '').replace(
            '!', '').replace('?', '') for s in currentTweet
    ]
    currentTweet = [s.strip('\\') for s in currentTweet]
def tokenize(tweets):
    tknzr = TweetTokenizer()
    tokenized_tweets = []
    for tweet in tweets:
        tokenized_tweets.append(tknzr.tokenize(tweet))
    return tokenized_tweets
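A quick usage sketch for the helper above; the tweets are made up, and the token boundaries shown in the comment are indicative of TweetTokenizer's defaults (case preserved, no length reduction).

sample_tweets = ["@user thanks for the follow :-)", "loving this #sunset"]
print(tokenize(sample_tweets))
# e.g. [['@user', 'thanks', 'for', 'the', 'follow', ':-)'],
#       ['loving', 'this', '#sunset']]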
Example #48
# array that contains all words
all_words = []

# list of stopwords to filter out
STOPWORDS = set(stopwords.words('english')) - set(BLACKLIST_STOPWORDS)
longest_tweet = 0
spoiler_counter = 0
nonspoiler_counter = 0
tweet_lengths = []

with open(FILE_NAME, 'r', newline='') as file:
    reader = csv.reader(file)
    tweet_tok = TweetTokenizer()
    for row in reader:
        tweet = []
        words = tweet_tok.tokenize(row[0])
        for word in words:
            if word.lower() not in STOPWORDS:
                if not re.match(PUNCTUATION, word.lower()):
                    if word.lower() in CONTRACTIONS:
                        # print("contraction changing:", word, 'to', CONTRACTIONS[word.lower()])
                        for expanded_word in word_tokenize(CONTRACTIONS[word.lower()]):
                            # add lowercase version of word to all_words array
                            if expanded_word not in STOPWORDS:
                                print("Success!")
                                tweet.append(expanded_word)
                            else:
                                print('removing:', expanded_word)
                    else:
                        tweet.append(word.lower())
        if len(tweet) > longest_tweet:
Example #49
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"
    u"\U00002600-\U000027BF"
    u"\U0001f300-\U0001f64F"
    u"\U0001f680-\U0001f6FF"
    u"\u2600-\u27BF"
    "]+",
    flags=re.UNICODE)
wordnet_lemmatizer = WordNetLemmatizer()
for t in tweets:
    stop = set(stopwords.words('english'))  #stop words!
    fragments = tknz.tokenize(t)
    clean_fragments = []
    for f in fragments:
        if f not in stop:  # not included in the stop words
            f = emoji_pattern.sub(r'', f)
            f = f.lower()  #lowercase fragment
            f = re.sub(r'[.,"!~_:|?\']+', '', f,
                       flags=re.MULTILINE)  # Special characters
            f = re.sub(r'\.\.\.', '', f, flags=re.MULTILINE)  # 3 dots
            f = re.sub(url_expression, '', f, flags=re.MULTILINE)  # links
            f = re.sub(r'@[a-z,A-Z,0-9 ]*', '', f,
                       flags=re.MULTILINE)  #clean at person references
            f = re.sub(r'RT @[a-z,A-Z]*: ', '', f,
                       flags=re.MULTILINE)  #Remove retweets
            f = wordnet_lemmatizer.lemmatize(f)
            if f:
from sklearn.feature_extraction.text import TfidfVectorizer

tweetFile = pd.read_csv("Tweets-Data.csv")
dataFrame = pd.DataFrame(tweetFile[['tweet_data']])
tweetData = tweetFile['tweet_data']

tknzr = TweetTokenizer()
stopWords = set(stopwords.words("english"))

# words = word_tokenize(data[0]) #For 1 line

cleanedData = []
cleaned = []

for line in tweetData:
    tweet = tknzr.tokenize(str(line))

    for word in tweet:
        if word not in string.punctuation:
            if '@' not in word:
                cleaned.append(word)

    cleanedData.append(cleaned)
    cleaned = []

sentencedData = []

for sentence in cleanedData:
    sentencedData.append(" ".join(sentence))

tweetFile.insert(4, "clean_data", "")
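TfidfVectorizer is imported at the top of this example but the snippet ends before it is used; below is only a sketch of a plausible next step, filling the "clean_data" column created above and vectorizing the joined sentences.

tweetFile["clean_data"] = sentencedData
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(sentencedData)  # sparse (n_tweets, n_terms) matrix
print(tfidf_matrix.shape)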
Example #51
def preprocessing(dfs, candidate_list):
    '''
    Tokenizes tweets - transforms each sentence into separate words.
    Loops through the DataFrame dictionary to:
        - turn every word into lowercase
        - remove symbols such as .?:;
        - remove hashtags and mentions
        - create dedicated columns for hashtags and mentions
        - remove stopwords (connectives, prepositions)
    '''

    tt = TweetTokenizer()

    Stop_Words_Spacy = list(STOP_WORDS)
    Stop_Words_NLTK = list(stopwords.words('portuguese'))

    All_Stop_Words = list(set(Stop_Words_NLTK + Stop_Words_Spacy + [',']))

    print('Preprocessing the Data...')

    for key in dfs:

        # Tokenizing - takes a phrase and isolate each word
        dfs[key]['token_list'] = dfs[key].apply(lambda x: tt.tokenize(x.text),
                                                axis=1)

        # Dropping unnecessary labels
        dfs[key].drop(labels=['id', 'datetime', 'created_at'],
                      axis=1,
                      inplace=True)

        # Lowering words
        dfs[key]['token_list'] = [[word.lower() for word in lists]
                                  for lists in dfs[key].token_list]

        # Removing Stop Words - connectives, prepositions...
        dfs[key]['token_list'] = [[
            word for word in lists if word not in All_Stop_Words
        ] for lists in dfs[key].token_list]

        # Separating Hashtags
        dfs[key]['Hashtag'] = [[
            word[1:] for word in lists if re.match('#', word) is not None
        ] for lists in dfs[key].token_list]

        # Separating Twitter Mentions
        dfs[key]['Mentions'] = [[
            word[1:] for word in lists if re.match('@', word) is not None
        ] for lists in dfs[key].token_list]

        # Removing Links, Hashtags and Mentions
        pattern_twitter = '((https)|@|#)'
        dfs[key]['token_list'] = [[
            word for word in lists if re.match(pattern_twitter, word) is None
        ] for lists in dfs[key].token_list]

        # Removing all symbols
        pattern_words_numbers = '[àÀáÁéÉçôõãúÚíÍóÓ, 0-9a-zA-Z]+'
        dfs[key]['token_list'] = [[
            word for word in lists
            if re.match(pattern_words_numbers, word) is not None
        ] for lists in dfs[key].token_list]
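A minimal usage sketch for preprocessing(); the DataFrame and candidate name are invented for illustration, and candidate_list is simply passed through since the visible part of the function does not use it.

import pandas as pd

dfs = {
    "candidate_a": pd.DataFrame({
        "id": [1], "datetime": ["2020-01-01"], "created_at": ["2020-01-01"],
        "text": ["RT @alguem: adorei o debate de hoje #eleicoes https://t.co/x"],
    }),
}
preprocessing(dfs, candidate_list=["candidate_a"])
print(dfs["candidate_a"][["token_list", "Hashtag", "Mentions"]])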
Example #52
class Data:
    def __init__(self):
        self.matrix = []
        self.truth = []
        self.tokenizer = TweetTokenizer()

    def clear(self):
        self.matrix = []
        self.truth = []

    def add_data_sample(self, sample, author):
        #Normalization

        # Stylometric features
        # ----------------------------------------------------------------
        sample = numpy.append(sample, self.hapax_legomenom_author(author))
        sample = numpy.append(sample, self.average_word_per_tweet(author))
        sample = numpy.append(sample, self.number_of_words(author))
        sample = numpy.append(sample, self.longest_word(author))
        sample = numpy.append(sample, self.average_length_of_word(author))
        sample = numpy.append(sample, self.four_letter_words(author))
        sample = numpy.append(sample, self.five_letter_words(author))
        sample = numpy.append(sample, self.six_letter_words(author))
        sample = numpy.append(sample, self.seven_letter_words(author))
        sample = numpy.append(sample, self.misspelled_words(author))
        "------------------------------------------------------------------"

        normalized_sample = preprocessing.normalize([sample])
        sample = numpy.array(normalized_sample[0])
        self.matrix.append(sample)

    def add_gender(self, sample):
        if (sample == 'F'):
            self.truth.append(0)
        else:
            self.truth.append(1)

    def add_age(self, sample):
        if (sample == "18-24"):
            self.truth.append(0)
        elif (sample == "25-34"):
            self.truth.append(1)
        elif (sample == "35-49"):
            self.truth.append(2)
        else:
            self.truth.append(3)

    def add_personality(self, sample):
        self.truth.append(sample)

    def baseline(self, authors, truth, category):
        for author in authors:
            author_id = author.attrib['id']
            author_truth = truth[author_id]
            for tweet in author:
                tokenized_tweet = self.tokenizer.tokenize(tweet.text)
                # accumulate 300-dimensional word embeddings for this tweet
                tweet_vec = numpy.zeros(300)
                for word in tokenized_tweet:
                    if (word in model.vocab and word not in stop_words):
                        tweet_vec += model[word]

                self.add_data_sample(tweet_vec, author)
                if (category == "gender"):
                    self.add_gender(author_truth)
                elif (category == "age"):
                    self.add_age(author_truth)
                else:
                    self.add_personality(author_truth)

    #author version
    def hapax_legomenom_author(self, author):
        # counts the author's distinct word types (all unique tokens,
        # not only words that occur exactly once)
        list_of_words = []
        for tweet in author:
            tokenized_tweet = self.tokenizer.tokenize(tweet.text)
            list_of_words = list_of_words + tokenized_tweet
        return len(set(list_of_words))

    def average_word_per_tweet(self, author):
        list_of_words = []
        number_of_tweets = 0
        for tweet in author:
            tokenized_tweet = self.tokenizer.tokenize(tweet.text)
            list_of_words = list_of_words + tokenized_tweet
            number_of_tweets += 1
        return (len(list_of_words) / number_of_tweets)

    def number_of_words(self, author):
        list_of_words = []
        for tweet in author:
            tokenized_tweet = self.tokenizer.tokenize(tweet.text)
            list_of_words = list_of_words + tokenized_tweet

        return len(list_of_words)

    def longest_word(self, author):
        list_of_words = []
        for tweet in author:
            tokenized_tweet = self.tokenizer.tokenize(tweet.text)
            list_of_words = list_of_words + tokenized_tweet

        max = 0
        for word in list_of_words:
            if (len(word) > max):
                max = len(word)
        return max

    def average_length_of_word(self, author):
        list_of_words = []
        for tweet in author:
            tokenized_tweet = self.tokenizer.tokenize(tweet.text)
            list_of_words = list_of_words + tokenized_tweet

        len_sum = 0
        for word in list_of_words:
            len_sum += len(word)

        return len_sum / len(list_of_words)

    def four_letter_words(self, author):
        list_of_words = []
        for tweet in author:
            tokenized_tweet = self.tokenizer.tokenize(tweet.text)
            list_of_words = list_of_words + tokenized_tweet

        # proportion of words shorter than four letters
        shorter_than_four = 0
        for word in list_of_words:
            if (len(word) < 4):
                shorter_than_four += 1

        return shorter_than_four / len(list_of_words)

    def five_letter_words(self, author):
        list_of_words = []
        for tweet in author:
            tokenized_tweet = self.tokenizer.tokenize(tweet.text)
            list_of_words = list_of_words + tokenized_tweet

        # proportion of words longer than five letters
        longer_than_five = 0
        for word in list_of_words:
            if (len(word) > 5):
                longer_than_five += 1

        return longer_than_five / len(list_of_words)

    def six_letter_words(self, author):
        list_of_words = []
        for tweet in author:
            tokenized_tweet = self.tokenizer.tokenize(tweet.text)
            list_of_words = list_of_words + tokenized_tweet

        # proportion of words longer than six letters
        longer_than_six = 0
        for word in list_of_words:
            if (len(word) > 6):
                longer_than_six += 1

        return longer_than_six / len(list_of_words)

    def seven_letter_words(self, author):
        list_of_words = []
        for tweet in author:
            tokenized_tweet = self.tokenizer.tokenize(tweet.text)
            list_of_words = list_of_words + tokenized_tweet

        # proportion of words longer than seven letters
        longer_than_seven = 0
        for word in list_of_words:
            if (len(word) > 7):
                longer_than_seven += 1

        return longer_than_seven / len(list_of_words)

    def misspelled_words(self, author):
        list_of_words = []
        for tweet in author:
            tokenized_tweet = self.tokenizer.tokenize(tweet.text)
            list_of_words = list_of_words + tokenized_tweet

        number_of_misspelled_words = 0
        for word in list_of_words:
            if (word not in model.vocab):
                number_of_misspelled_words += 1

        return number_of_misspelled_words / len(list_of_words)
Example #53

# Tweet loading and cleaning
wrong = 0
with open('neg.txt', 'r', encoding='utf8') as f:
    negtweets = []
    for line in f.readlines():
        tweet = line.replace('\n', '')
        # Removal of URLs, hashtags and mentions
        tweet_regex = regex_spaces.sub(
            ' ', regex_ht_mn.sub('', regex_url.sub('', tweet))).lower()
        # Removal of caps and accents
        tweet_raw = unidecode.unidecode(tweet_regex).lower()
        tokens = [
            remove_repeated_chars(stemmer.stem(t))
            for t in tweet_tokenizer.tokenize(tweet_regex)
            if not t in stopwords and not regex_nonword.match(t)
        ]
        negtweets.append(([tokens, 'neg']))

with open('pos.txt', 'r', encoding='utf8') as f:
    postweets = []
    for line in f.readlines():
        tweet = line.replace('\n', '')
        # Removal of URLs, hashtags and mentions
        tweet_regex = regex_spaces.sub(
            ' ', regex_ht_mn.sub('', regex_url.sub('', tweet))).lower()
        # Removal of caps and accents
        tweet_raw = unidecode.unidecode(tweet_regex).lower()
        tokens = [
            remove_repeated_chars(stemmer.stem(t))
Example #54
# Chapter 3: Preprocessing - tokenization - using NLTK's built-in tokenizers
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

# LineTokenizer (splits text into lines)
lTokenizer = LineTokenizer()
print(
    "Line tokenizer output :",
    lTokenizer.tokenize(
        "My name is " +
        "Maximus Decimus Meridius, commander of the Armies of the North, " +
        "General of the Felix Legions and loyal servant to the true emperor, " +
        "Marcus Aurelius. \nFather to a murdered son, husband to a murdered " +
        "wife. \nAnd I will have my vengeance, in this life or the next."))

# SpaceTokenizer (splits on whitespace characters)
rawText = "By 11 o'clock on sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Space Tokenizer output :", sTokenizer.tokenize(rawText))

# word_tokenize (splits into words and punctuation)
print("word Tokenizer output :", word_tokenize(rawText))

# TweetTokenizer (handles tweet-specific tokens such as emoticons and hashtags)
tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output :",
      tTokenizer.tokenize("This is a coooool " + "#dummysmiley: :-) :-P <3"))
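For the tweet-style string above, the point of TweetTokenizer is that it keeps hashtags and emoticons intact, while word_tokenize breaks them into punctuation; a small side-by-side sketch (the output shown in the comment is indicative, not from the original).

s = "This is a coooool #dummysmiley: :-) :-P <3"
print(TweetTokenizer().tokenize(s))
# ['This', 'is', 'a', 'coooool', '#dummysmiley', ':', ':-)', ':-P', '<3']
print(word_tokenize(s))
# word_tokenize splits '#dummysmiley' and the emoticons into separate symbols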
            outfile.write(',')
    outfile.write('\n')

with open('predict_handles.txt', 'r') as handle_file:
    # Read usernames from file
    handles = handle_file.read().split('\n')
    for handle in handles[:-1]:
        # Call Twitter API
        os.system('python get_status.py ' + handle)

        # Tokenize text
        text = []
        with open('statuses/statuses_' + handle + '_output.txt',
                  'r',
                  encoding='utf8') as tweet_text:
            text = tokenizer.tokenize(tweet_text.read().replace(
                '\nTWEETLINEBREAK\n', ' '))

        # Turn into features
        X = fill_features(text)

        # Score
        scores = []
        print(X.shape)
        for i in range(len("ocean")):  # five iterations, one per OCEAN personality trait
            # Print the second element of the probability array because it's the '1' prob
            scores.append(str(float(mnbs[i].predict_proba(X)[0][1])))

        # Turn the scores into marketing segments
        with open('output/prediction_service_pca_segments.csv',
                  'a') as outfile:
            outfile.write(handle + ',')
Example #56
 def apostrophe_tokenize(self, word):
     """
     Handles the tokenization of apostrophes correctly
     """
     aposToken = TweetTokenizer()
     return aposToken.tokenize(word)
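A small illustration of why a TweetTokenizer is used here: it keeps contractions and possessives as single tokens instead of splitting off the apostrophe part the way word_tokenize does. The sample text is invented and the commented output is indicative.

from nltk.tokenize import TweetTokenizer

print(TweetTokenizer().tokenize("Don't touch Sam's phone, it's new"))
# ["Don't", 'touch', "Sam's", 'phone', ',', "it's", 'new']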
Example #57
positive_tweets = positive_tweets_refined
negative_tweets = negative_tweets_refined

for i in range(len(positive_tweets)):
	sentiment_tweets.append((positive_tweets[i],'positive'))

for i in range(len(negative_tweets)):
	sentiment_tweets.append((negative_tweets[i],'negative'))

random.shuffle(sentiment_tweets)


tweets =  positive_tweets+negative_tweets

for tweet in tweets:
	for word in tknz.tokenize(tweet):
		if word.lower() not in stop_words and not word.startswith('https'):
			all_words.append(word.lower())


all_words = nltk.FreqDist(all_words)
# print(all_words['awesome'])
all_words = (all_words.most_common(2000))
# print(all_words['awesome'])


# for word, freq in all_words:
# 	print(word.encode('utf-8'),freq)

word_features = [x[0] for x in all_words]
def tokenize(text):
    tknzr = TweetTokenizer()
    return tknzr.tokenize(text)
Example #59
# nltk demo
# Remove stop words (function words) from the input text, keeping content words.
#
# Mustafa Hussain
# also Daren Thomas and Stephen Falk:
# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
#
# Copyright: CC BY-NC 4.0 (https://creativecommons.org/licenses/by-nc/4.0/)
#
# Must download stopwords first. Run in Python shell:
# nltk.download("stopwords")

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

# example: hulk would like to smash the house please
#     becomes "hulk would like smash house please"
#
# example: "to be or not to be that is the question" becomes "question"

tkzr = TweetTokenizer()

while True:
    in1 = input("hulk) ")
    word_list = tkzr.tokenize(in1)  #in1.split(" ")

    filtered_words = [
        word for word in word_list if word not in stopwords.words('english')
    ]
    print(" ".join(filtered_words).upper())
Example #60
    def __init__(self,DIR,format, content,column, source='unspecified'):

            self.DIR = DIR
                
            if format == 'URL':
                html = urlopen(content).read()
                soup = BeautifulSoup(html, 'html.parser')

                # kill all script, and style elements
                for script in soup(["script", "style","h1","h2","h3",
                                    "h4","h5","a","span","label","button"]):
                    script.extract()    # rip it out

                # get text
                text = soup.get_text()
                # break into lines and remove leading and trailing space on each
                lines = (line.strip() for line in text.splitlines())
                # break multi-headlines into a line each
                chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
                # drop blank lines
                self.text = '\n'.join(chunk for chunk in chunks if chunk)
                sentences = sent_tokenize(self.text)

            elif format == 'file':
                Array = []
                with open(content,'r',encoding="utf-8") as f:
                    reader = csv.reader(f)
                    try:
                        for row in reader:
                            Array.append(row)
                    except Exception as e:
                        print(e)
                
                df = pd.DataFrame(Array[1:],columns=Array[0])
                #df = pd.read_csv(content,encoding="utf-8")
                sentences = df[column].dropna().astype('str').tolist()
                self.text = '\n'.join(df[column].astype('str').tolist())


            # save phrases
            
            # split each sentence into rough phrases on punctuation characters
            phrases = []
            regex = re.compile('[%s%s]' % (string.punctuation,'|\"\',\t\n’”“'))
            for item in sentences:
                for i in regex.split(item):
                    if i != '' and i.isdigit() != True and len(i)>20:
                        phrases.append(i.lower())
            phrases.insert(0,'Phrase')
            fname_phrases = self.DIR + '/sentence.csv'
            with open(fname_phrases, "w", newline='') as f:
                for item in phrases:
                    try:
                        f.write("{}\n".format(item)) 
                    except UnicodeEncodeError:
                        pass
            print(fname_phrases)


            # tokenize the word
            if format == 'URL':
                self.tokens = [wordpunct_tokenize(t) for t in sentences]
            elif format == 'file':
                # assumes source is 'twitter' or 'reddit'; with the default
                # 'unspecified' no tokenizer is bound and the line below raises NameError
                if source == 'twitter':
                    tknz = TweetTokenizer()
                elif source == 'reddit':
                    tknz = tokenizer.RedditTokenizer()
                self.tokens = [tknz.tokenize(t) for t in sentences]
                
           
            # nltk's stopwords are too weak
           
            with open(os.path.dirname(__file__)+'/stopwords_en.txt','r') as f:
                stopwords2 = f.read().split('\n')
                
            with open(os.path.dirname(__file__)+'/twitter-customized.txt','r') as f:
                stopwords3 = f.read().split(',')

            self.filtered_tokens_lower = []
            self.filtered_tokens = []
            for token in self.tokens:
                self.filtered_tokens.append([word for word in token if (word.lower() not in stopwords.words('english')) # nltk stopwords
                                             and (word.lower() not in stopwords2) # third-party stopwords: https://sites.google.com/site/kevinbouge/stopwords-lists
                                             and (word.isdigit() == False)      # no numbers
                                             and (word.isalnum() == True )      # alphanumeric tokens only
                                             and (word.lower() not in stopwords3) ])  # twitter-specific stopwords: https://sites.google.com/site/iamgongwei/home/sw
                self.filtered_tokens_lower.append([word.lower() for word in token if (word.lower() not in stopwords.words('english'))
                                             and (word.lower() not in stopwords2)
                                             and (word.isdigit() == False)
                                             and (word.isalnum() == True )
                                             and (word.lower() not in stopwords3) ])

            fname_filtered = self.DIR + '/tokenized.csv'
            with open(fname_filtered, "w", newline='') as f:
                writer = csv.writer(f)
                try:
                    writer.writerows(self.filtered_tokens_lower)
                except UnicodeEncodeError:
                    pass
            print(fname_filtered)