Code Example #1
def reasoning(dList):
	reasonList = []
	tokenizer = TweetTokenizer()
	for tweet in dList:
		print(tweet)
		# tokenize
		words = tokenizer.tokenize(tweet)
		# get POS tag
		pos_tokens = pos_tag(words)
		# get name entities
		tree = ne_chunk(pos_tokens, binary = False)
		# find relations
		pairs = relextract.tree2semi_rel(tree)
		# get interesting name entities
		reason = []
		for s, tree in pairs:
			reasonStr = ("%s") % tree
			reasonStr = reasonStr.split(" ")
			label = reasonStr[0].replace("(","").strip()
			content = ""
			for wordTag in reasonStr[1:]:
				sp = wordTag.split("/")
				word = sp[0].replace("(","")
				print(word)
				# content.append(word)
				content += (word + " ")
			# reason: [(label, content)]
			reason.append({"label": label, "content": content})
		# reasonList [reason]
		if len(reason) > 0:
			reasonList.append({"reason": reason})
		print(str(len(reasonList)) + "/" + str(len(dList)))
	return reasonList
Code Example #2
File: ontology.py Project: xR86/ml-stuff
def nltk_tokenize(text):
    tokens = []

    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(text)

    return tokens
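A minimal usage sketch of the helper above (not from the quoted project), assuming nltk is installed and TweetTokenizer is imported as in the snippet:

# Hypothetical call: TweetTokenizer keeps @handles, hashtags and emoticons
# together as single tokens instead of splitting them apart.
print(nltk_tokenize("@user thanks for the tip! #nlp :-)"))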
Code Example #3
File: tag.py Project: smilligan93/undergrad
 def _tag_text(self, tweet_text):
     tokenizer = TweetTokenizer()
     tokens = tokenizer.tokenize(tweet_text)
     tagged = nltk.pos_tag(tokens)
     entities = nltk.chunk.ne_chunk(tagged)
     neList = traverse(entities)
     return neList
Code Example #4
def process_tweets(file_name):
    '''
    Person Responsible: Devin Munger

    + file_name: filename of tweets as returned from API based on query
   
    Extract text from file; return dataframe with tweet text, id
    '''
    ## Create empty dataframe
    tweet_df = pd.DataFrame(columns = ["text", "id"])

    tokenizer = TweetTokenizer(preserve_case = False, strip_handles = True)
    ## Read each JSON from file
    with open(file_name) as data_file:
        for entry in data_file.readlines():
            tweet = json.loads(entry)
            tweet_id = str(tweet.get("id", ""))
            text = tweet.get("text", "")
            ## Remove links from text
            text = re.sub(r"http\S+", "", text)
            ## Remove twitter keywords
            text = text.replace("RT ", "")
            ## Remove handle, punctuation from tweet text
            text_words = filter(lambda x: x not in string.punctuation, tokenizer.tokenize(text))
            ## Add tweet to dataframe
            tweet_df.loc[len(tweet_df)] = [" ".join(text_words), tweet_id]
    return tweet_df
Code Example #5
def load_csv():
    with open('Tweets.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        count = 1

        reviews = []
        stars = []
        tknzr = TweetTokenizer()
        for row in reader:
            try:
                words=tknzr.tokenize(row['text'])
                label = 'SENT_%s' % count

                #print label
               # TaggedDocument(utils.to_unicode(row['text']).split(), [label])
                # print "label:", label
                #labels = [label]
                #lab_sent = LabeledSentence(words, label)
                #print lab_sent
                #reviews.append(TaggedDocument(utils.to_unicode(row['text']).split(), [label]))
                reviews.append(TaggedDocument(words, [label]))
                stars.append(row['airline_sentiment'])
                count += 1
            except:
                continue

    print "final count:", count
    return reviews, stars
Code Example #6
def load_data_and_labels_gameforum():
    # load
    with open("./input/gameforum-1000.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header

    dataset = [entry for entry in dataset if (entry[1] == '1' or entry[1] == '2' or entry[1] == '3')]

    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[0] for entry in dataset]
    x_text = [clean_str(post) for post in x_text]
    x_text = [tk.tokenize(post) for post in x_text]

    # generate y
    y = [entry[1] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1':  # positive
            y[idx] = [1, 0, 0]
        elif label == '2':  # neutral
            y[idx] = [0, 1, 0]
        elif label == '3':  # negative
            y[idx] = [0, 0, 1]
        else:
            print('wrong label in gameforum: ' + label)

    return [x_text, y]
Code Example #7
def load_tweetkeywords():
    """
    Check and see which keywords are used in each tweet, and load the association
    table linking tweets and keywords
    """

    # TweetKeyword.query.delete()

    tweets = Tweet.query.all()
    keyword_query = Keyword.query.all()
    keywords = [word.keyword for word in keyword_query]

    tknzr = TweetTokenizer()

    for tweet in tweets:
        tokenized_tweets = tknzr.tokenize(tweet.text)

        for token in tokenized_tweets:
            if token in keywords:
                tweet_id = Tweet.query.filter(Tweet.tweet_id == tweet.tweet_id).one()
                keyword_id = Keyword.query.filter(Keyword.keyword == token).one()
                tweet_keyword = TweetKeyword(keyword_id=keyword_id.keyword_id, tweet_id=tweet_id.tweet_id)
                print "Added to TweetKeyword table: {}".format(tweet_keyword.keyword_id)
                db.session.add(tweet_keyword)

    db.session.commit()
Code Example #8
File: msg.py Project: nicolay-r/tone-classifier
    def parse(self, text):

        # Tokenize message
        tokenizer = TweetTokenizer()
        words = tokenizer.tokenize(text)

        retweet_term = 'RT'

        urls = []
        users = []
        hash_tags = []
        for word in words:
            if (word[0] == '@'):
                # user in Twitter
                users.append(word)
            elif (word[0] == '#'):
                # hash tags
                hash_tags.append(word)
            elif (word.find('http:') == 0 or word.find('https:') == 0):
                # url
                urls.append(word)

        for f in urls + users + hash_tags + [retweet_term]:
            if f in words:
                words.remove(f)

        self.words = words
        self.urls = urls
        self.users = users
        self.hash_tags = hash_tags
Code Example #9
def format_text(entries, LSTM_shape=True):
	THIS_FOLDER = str(os.path.dirname(os.path.abspath(__file__)))
	sentences = []
	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	decoded = base64.b64decode(entries)
	decoded = str(decoded)
	decoded = decoded[2:]
	decoded = decoded[:-1]
	decoded = decoded.split(".")
	#print(decoded, "is decoded")
	for entry in decoded:
		token_sentences = tokenizer.tokenize(entry)
		for sentence in token_sentences:
			sentences.append(sentence)

	tokenized_sentences = []
	#remove_tokens = ['%', ']', '[', '.', ',', '?', '!', '\'']
	#remove_tokens = string.punctuation
	remove_tokens = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
	stop_words = set(stopwords.words('english'))
	tweet_tknzr = TweetTokenizer()
	for sentence in sentences:
		tokens = tweet_tknzr.tokenize(sentence)
		tokens = list(filter(lambda a: a not in remove_tokens and a not in stop_words, tokens))
		tokenized_sentences.append(tokens)

	all_ngrams1 = np.load(THIS_FOLDER+'/ngrams1.npy').item()
	all_ngrams2 = np.load(THIS_FOLDER+'/ngrams2.npy').item()
	all_ngrams3 = np.load(THIS_FOLDER+'/ngrams3.npy').item()
	#once the model gets updated with good data, ngrams.py needs to get changed/updated too!

	X = np.zeros((len(sentences), len(all_ngrams1)+len(all_ngrams2)+len(all_ngrams3)))
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 1)
		for gram in my_ngrams:
			if gram in all_ngrams1:
				index = all_ngrams1[gram]
				X[i][index] = 1
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 2)
		for gram in my_ngrams:
			if gram in all_ngrams2:
				index = len(all_ngrams1) + all_ngrams2[gram]
				X[i][index] = 1
	for i in range(len(tokenized_sentences)):
		sentence = tokenized_sentences[i]
		my_ngrams = ngrams(sentence, 3)
		for gram in my_ngrams:
			if gram in all_ngrams3:
				index = len(all_ngrams1) + len(all_ngrams2) + all_ngrams3[gram]
				X[i][index] = 1


	if LSTM_shape:
		X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
	else:
		X = np.reshape(X, (X.shape[0], X.shape[1]))
	return X
Code Example #10
def preprocess_db():
    tkn = TweetTokenizer()
    photos = pd.read_pickle(r'./data/restaurant_photos_with_labels.pkl')
    img_path = r'./data/restaurant_photos/'
    sentid = 1
    img_list = []

    # Split data in such a way that labels are evenly distributed between 6 folds
    skf = StratifiedKFold(photos['label'], n_folds=6)

    folds = []
    # Initialize all images to train dataset initially
    photos['split'] = ['train' for i in range(len(photos))]

    # Obtain the indices for the test and validation splits and change value appropriately
    for _, test_ix in skf:
        folds.append(test_ix)
    photos.split[folds[0]] = 'test'
    photos.split[folds[1]] = 'val'

    # Obtain the information from each picture and move the pictures to the appropriate dir. The images are renamed.
    for i, photo_id in enumerate(photos.photo_id):
        img_dict = dict()
        img_dict['sentids'] = [sentid]
        img_dict['business_id'] = photos.business_id[i]
        if photos.split[i] in ['train']:
            img_dict['filepath'] = u'train'
            img_dict['imgid'] = 0
            img_dict['split'] = u'train'
            shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/train/' + str(sentid).zfill(6) + '.jpg')
        elif photos.split[i] in ['test']:
            img_dict['filepath'] = u'test'
            img_dict['imgid'] = 0
            img_dict['split'] = u'test'
            shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/test/' + str(sentid).zfill(6) + '.jpg')
        else:
            img_dict['filepath'] = u'val'
            img_dict['imgid'] = 0
            img_dict['split'] = u'val'
            shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/val/' + str(sentid).zfill(6) + '.jpg')
        img_dict['label'] = photos.label[i]
        caption_dict = dict()
        if photos.caption[i]:
            # Tokenize the captions
            caption_dict['tokens'] = tkn.tokenize(photos.caption[i])
            caption_dict['raw'] = photos.caption[i]
        else:
            caption_dict['tokens'] = 'None'
            caption_dict['raw'] = 'None'
        caption_dict['imgid'] = 0
        caption_dict['sentid'] = sentid
        img_dict['sentences'] = [caption_dict]
        img_dict['photoid'] = sentid
        img_dict['yelpid'] = photo_id
        img_list.append(img_dict)
        sentid += 1

    # Store the new dataset as a JSON file
    with open("./data/image_caption_dataset.json", "w") as outfile:
        json.dump(img_list, outfile)
Code Example #11
File: main.py Project: tentangdata/bilp-heroku
def check():
	check_id = request.args.get("id")
	if check_id is not None:
		check_sentence = Sentence.query.get(check_id)
		if check_sentence is not None:
			Word.query.filter_by(sentence_id=check_id).delete()
			tweet_tokenizer = TweetTokenizer()
			tokens = tweet_tokenizer.tokenize(check_sentence.text)
			for token in tokens:
				url = "http://kateglo.com/api.php?format=json&phrase="+token
				resp = requests.get(url)
				exist = False
				if (resp.ok):
					try:
						resp_json = json.loads(resp.content)
						exist = True
					except ValueError:
						exist = False
				word = Word(check_sentence.id, token, exist)
				db.session.add(word)
			db.session.commit()
	sentences = Sentence.query.all()
	c = ((sentence.id, 
		sentence.source, 
		sentence.text, 
		((w.word, w.exist,) for w in sentence.words.all()), 
		) for sentence in sentences)
	return render_template('check.html', rows=c)
Code Example #12
File: tweet.py Project: SocialNPHS/SocialNPHS
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0],
                                                     tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged
Code Example #13
def load_data_and_labels_sam():
    # load
    with open("./input/2780_freshmen_tweets.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header

    # filter out tweets with unknown sentiment
    dataset = [entry for entry in dataset if entry[4] != '0']

    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[3] for entry in dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]

    # generate y
    y = [entry[4] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1': # positive
            y[idx] = [1, 0, 0]
        elif label == '2': # neutral
            y[idx] = [0, 1, 0]
        elif label == '3': # negative
            y[idx] = [0, 0, 1]
        else:
            print('wrong label in sam: ' + label)

    return [x_text, y]
Code Example #14
def preprocess_tweets(event_date, dt=datetime.timedelta(seconds=30),
                      match=None, tweet_processor=None, match_type='home'):
    import collections
    
    tknzr = TweetTokenizer()
    
    dbname = match['dbname']
    collname_home = match['collname_home']
    collname_away = match['collname_away']
    home_team = match['home_team']
    away_team = match['away_team']
    
    if match_type == 'home':
        coll = client[dbname][collname_home]
    else:
        coll = client[dbname][collname_away]

    # add some padding to the start and end times
    date_start = event_date - dt
    date_end = event_date + dt

    query = { "created_at": {"$gt": date_start, "$lt": date_end}}
    
    results = coll.find( query )
    clean_tweets = []
    for result in results:
        tweet_id = result['id_str']
        tweet_split = tweet_processor.preprocess(result['text'].encode('ascii', 'ignore'))
        
        parts = tknzr.tokenize(tweet_split)
        clean = [i for i in parts if i not in stop]
        clean_text = " ".join (clean)
        clean_tweets.append( (clean_text, tweet_id) )
        
    return clean_tweets
Code Example #15
File: train_rnn.py Project: jdbrandon/15780proj
def createDataset(filename, MAX_VOCAB_SIZE):
    yaks = []
    tokenizer = TweetTokenizer()
    ids = set()
    numyaks = 0
    for line in open(filename).readlines():
        stuff = line.split(":::")
        id = stuff[0]
        if len(stuff) > 3 and id not in ids:
            numyaks+=1
            sentence = stuff[3]
            ids.add(id)
            tokens = [START_TOKEN]
            tokens.extend(tokenizer.tokenize(sentence.lower()))
            tokens.append(END_TOKEN)
            yaks.append(tokens)
    token_frequency = nltk.FreqDist(itertools.chain(*yaks))
    vocab = token_frequency.most_common(MAX_VOCAB_SIZE-1)
    i2t = [token[0] for token in vocab]
    i2t.append(UNKNOWN_TOKEN)
    t2i = dict()
    for i,t in enumerate(i2t):
        t2i[t] = i
    
    yaks = [[t if t in t2i else UNKNOWN_TOKEN for t in yak] for yak in yaks]
    
    Xtrain = np.asarray([[t2i[token] for token in yak[:-1]] for yak in yaks])
    Ytrain = np.asarray([[t2i[token] for token in yak[1:]] for yak in yaks])
    print "Num unique Yaks: "+str(numyaks)
    return (Xtrain, Ytrain, i2t, t2i)
Code Example #16
def getTweetTokens(classification, toRead, info, tags):
    i=0
    tknzr = TweetTokenizer()

    with open(toRead) as f:
        content = f.readlines()

    c = 0

    for item in content:
        #adapt the list into python dictionary format
        content[c] = item.replace("null", "None")
        content[c] = content[c].replace("false", "False")
        content[c] = content[c].replace("true", "True")
        c+=1

    for i in range(len(content)):
        tweet = eval(content[i])["text"]
        tokenTweet = tknzr.tokenize(tweet)
        j = 0
        k = 0
        while j < (len(tokenTweet) - k):
            #print j
            if tokenTweet[j][0] == "#":
                tokenTweet[j] = tokenTweet[j][1:]
            elif tokenTweet[j][0] == "@":
                del tokenTweet[j]
                j-=1
                k+=1
            j+=1
            
        info.append((word_feats(tokenTweet), classification))
Code Example #17
File: counter.py Project: redserg/shad-python-hw-3
def main():
    text = sys.stdin.read()

    tknzr = TweetTokenizer()
    tok = tknzr.tokenize(text)
    saved_object = construct_dict(tok)
    print(json.dumps(saved_object))
Code Example #18
def keywords_search(reviews):
    key_map = {}
    # for k in open(os.getcwd() + "/KeyWord/keyword_map_general.txt", 'r'):
    for k in open(keyword_general_path, 'r'):
        a = k.strip().split(", ")
        key_map[a[0]] = a[1]

    special_map = {}
    # for k in open(os.getcwd() + "/KeyWord/keyword_map_special.txt", 'r'):
    for k in open(keyword_special_path, 'r'):
        a = k.strip().split(", ")
        special_map[a[0]] = a[1]

    raw = reviews.lower()
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(raw)

    # remove punctuations
    no_punc_tokens = [i for i in tokens if (not i in string.punctuation+string.digits) and (not "." in i)]

    # remove stop words from tokens
    en_stop = get_stop_words('en')
    stopped_tokens = [i for i in no_punc_tokens if not i in en_stop]

    # stem tokens
    # wordnet_lemmatizer = WordNetLemmatizer()
    # stemmed_tokens = [wordnet_lemmatizer.lemmatize(i) for i in stopped_tokens ] 

    chosen_key_words = []

    # Search in general key word
    key_words_dict = dict.fromkeys(key_map.values(), 0)

    # Select keyword use only key word to select
    # s = set(stemmed_tokens)
    s = set(stopped_tokens)
    for t in key_map.keys():
        if t in s:
            key_words_dict[key_map[t]] += 1

    for d in sorted(zip(key_words_dict.values(), key_words_dict.keys()))[:-4:-1]:
        if d[0] > 0:
            chosen_key_words.append(d[1])

    # Search in special keyword
    special_words_dict = dict.fromkeys(special_map.values(), 0)
    #  Select keyword using wordnet

    # Select keyword use only key word to select
    # s = set(stemmed_tokens)
    s = set(stopped_tokens)
    for t in special_map.keys():
        if t in s:
            special_words_dict[special_map[t]] += 1

    for d in sorted(zip(special_words_dict.values(), special_words_dict.keys()))[:-3:-1]:
        if d[0] > 0:
            chosen_key_words.append(d[1])

    return ' '.join(chosen_key_words)
Code Example #19
def get_utterances(utterances, line, category, wgram, cgram):
    tknzr = TweetTokenizer()
    gram_list = []
    # WORD GRAMS
    if wgram == 1:  # unigram
        wgram_list = tknzr.tokenize(line)
    elif wgram == 2:  # uni + bigram
        # unigram list
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        finder = BigramCollocationFinder.from_words(tokens)
        scored = finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, score in scored)
        # res
        wgram_list = tknzr.tokenize(line) + bigram_list
    elif wgram == 3: # uni + bi + trigram
        # unigram list
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        bi_finder = BigramCollocationFinder.from_words(tokens)
        bi_scored = bi_finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, biscore in bi_scored)  
        # trigram list
        tri_finder = TrigramCollocationFinder.from_words(tokens)
        tri_scored = tri_finder.score_ngrams(trigram_measures.raw_freq)
        trigram_list = sorted(trigram for trigram, triscore in tri_scored)
        # res
        wgram_list = tknzr.tokenize(line) + bigram_list + trigram_list
    
    # CHAR GRAMS
    cgram_list = []
    if cgram == 1:   # uni-chargram
        cgram_list = [line[i:i+1] for i in range(len(line)-1)]
    elif cgram == 2: # bi-chargram
        cgram_list = [line[i:i+2] for i in range(len(line)-1)]
    elif cgram == 3: # tri-chargram
        cgram_list = [line[i:i+3] for i in range(len(line)-1)]
        
    # RESULT
    if category == 'QA':            # non-task
        utterances.append((wgram_list + cgram_list, 0))
    elif category == 'Shopping':    # task
        utterances.append((wgram_list + cgram_list, 1))
    elif category == 'Travel':      # task
        utterances.append((wgram_list + cgram_list, 2))
    elif category == 'Hotel':       # task
        utterances.append((wgram_list + cgram_list, 3))
    elif category == 'Food':        # task
        utterances.append((wgram_list + cgram_list, 4))
    elif category == 'Art':         # task
        utterances.append((wgram_list + cgram_list, 5))
    elif category == 'Weather':     # task
        utterances.append((wgram_list + cgram_list, 6))
    elif category == 'Friends':     # task
        utterances.append((wgram_list + cgram_list, 7))
    elif category == 'Chat':        # chat
        utterances.append((wgram_list + cgram_list, 8))
    else:
        print(category, "ERROR")
Code Example #20
File: sentiment.py Project: ruaronicola/TelepathyBot
def classify(classifier, featx, strings):
    print "Classify request"
    tokenizer = TweetTokenizer()
    mood = []
    for string in strings:
        string = Twitter.process_tweet(string)
        tokenized_text = [word.lower() for word in tokenizer.tokenize(string)]
        mood.append(classifier.classify(featx(tokenized_text)))
    return mood
Code Example #21
 def get_lyrics(self):
     time.sleep(10)
     soup = BeautifulSoup(self.get_song_page(), 'lxml')
     page_lyric = soup.find_all("div", limit=22)[-1]  # lyrics start on 22nd div
     lyrics = ''.join(page_lyric.find_all(text=True))
     tknzr = TweetTokenizer()
     lyrics = tknzr.tokenize(lyrics)
     lyrics = [word for word in lyrics if word not in self.HTML_TAGS]
     return " ".join(lyrics[20:])
Code Example #22
def clean_tweet(tweet):
    tknzr = TweetTokenizer()
    tweet = re.sub(r"(?:\@|https?\://)\S+", "", tweet.lower())
    tweet = ' '.join(tweet.split())
    words = tknzr.tokenize(tweet)
    words = [''.join(c for c in s if c not in punctuation) for s in words]
    words = [s for s in words if s]
    sent = " ".join(words)
    return sent
Code Example #23
def load_data_and_labels_semeval():
    # load the entire semeval dataset
    old_dataset = list(open("./input/2013-dev"))
    old_dataset.extend(list(open("./input/2013-devtest")))
    old_dataset.extend(list(open("./input/2013-train")))
    old_dataset.extend(list(open("./input/2014-devtest")))

    new_dataset = list(open("./input/2016-train"))
    new_dataset.extend(list(open("./input/2016-dev")))
    new_dataset.extend(list(open("./input/2016-devtest")))

    # filter out invalid tweets from new dataset
    new_dataset = [entry for entry in new_dataset if entry.split('\t')[2] != 'Not Available\n']

    # generate x from old
    tk = TweetTokenizer(reduce_len=True)  # reduce_len caps elongated character runs (e.g. "sooooo" -> "sooo")
    x_text = [entry.split('\t')[3] for entry in old_dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]

    # generate x from new
    x_text_new = [entry.split('\t')[2] for entry in new_dataset]
    x_text_new = [clean_str(tweet) for tweet in x_text_new]
    x_text_new = [tk.tokenize(tweet) for tweet in x_text_new]

    # concat x and x_new
    x_text.extend(x_text_new)

    # generate y from old
    y = [entry.split('\t')[2] for entry in old_dataset]
    for idx, label in enumerate(y):
        if label == 'positive':
            y[idx] = [1, 0, 0]
        elif label == 'neutral':
            y[idx] = [0, 1, 0]
        elif label == 'negative':
            y[idx] = [0, 0, 1]
        else:
            print('wrong label in semeval: ' + label)

    # generate y from new
    y_new = [entry.split('\t')[1] for entry in new_dataset]
    for idx, label in enumerate(y_new):
        if label == 'positive':
            y_new[idx] = [1, 0, 0]
        elif label == 'neutral':
            y_new[idx] = [0, 1, 0]
        elif label == 'negative':
            y_new[idx] = [0, 0, 1]
        else:
            print('wrong label in semeval: ' + label)

    # concat y and y_new
    y.extend(y_new)

    return [x_text, y]
Code Example #24
def loaddata(inputfile):
    file = open(inputfile)
    tknzr = TweetTokenizer()
    sentences=[]
    while 1:
        line = file.readline().strip()
        if not line:
              break
        sentences.append(tknzr.tokenize(line))
    return sentences
Code Example #25
File: extract.py Project: campbeld1/twitterexplorer
def count_tweets_keywords(tweets):
    tknzr = TweetTokenizer()
    wordcounts = defaultdict(int)
    for tweet in tweets:
        if "text" in tweet:
            word_list = tknzr.tokenize(tweet["text"])
            filtered_words = [word for word in word_list if word not in stopwords.words("english")]
            for word in filtered_words:
                wordcounts[word] += 1
    return wordcounts
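The snippet above re-evaluates stopwords.words("english") for every token, which is slow on large collections. A hedged variant (a sketch, not the project's code) that builds the stopword set once up front:

from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

def count_tweets_keywords_fast(tweets):
    # Same counting logic as above, but the stopword set is constructed once.
    tknzr = TweetTokenizer()
    stop = set(stopwords.words("english"))
    wordcounts = defaultdict(int)
    for tweet in tweets:
        if "text" in tweet:
            for word in tknzr.tokenize(tweet["text"]):
                if word not in stop:
                    wordcounts[word] += 1
    return wordcounts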
Code Example #26
def tokenize(file_name):
    """ Takes as input a file name. Tokenize the tweets separating them using nltk function.
    Return a list of tokens"""
    tokenizer = TweetTokenizer(strip_handles=True)
    tokens = []
    file = open(file_name, 'r')
    for line in file:
        tokens.append(tokenizer.tokenize(line))
    file.close()
    return tokens
Code Example #27
File: build_data.py Project: TrueSam/shorttext
def get_sentence_from_training_doc_regexp(filename):
  document = get_text_from_training_doc_regexp(filename)
  sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
  sentences = sent_detector.tokenize(document)
  tknzr = TweetTokenizer()
  tokenized_sentences = []
  for i in range(len(sentences)):
    tokens = tknzr.tokenize(sentences[i].strip())
    tokenized_sentences.append(' '.join(tokens))
  return tokenized_sentences
Code Example #28
def keywords_search(reviews):
    key_map = {}

    for k in open(keyword_general_path, 'r'):
        a = k.strip().split(", ")
        key_map[a[0]] = a[1]

    special_map = {}

    for k in open(keyword_special_path, 'r'):
        a = k.strip().split(", ")
        special_map[a[0]] = a[1]

    # get the tokens from the review
    raw = reviews.lower()
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(raw)

    # remove punctuations
    no_punc_tokens = [i for i in tokens if (not i in string.punctuation + string.digits) and (not "." in i)]

    # remove stop words from tokens
    en_stop = get_stop_words('en')
    stopped_tokens = [i for i in no_punc_tokens if not i in en_stop]

    chosen_key_words = ['chinese']

    # Search in general key word
    key_words_dict = dict.fromkeys(key_map.values(), 0)

    # Select keyword use only key word to select
    s = set(stopped_tokens)
    for t in key_map.keys():
        if t in s:
            key_words_dict[key_map[t]] += 1

    for d in sorted(zip(key_words_dict.values(), key_words_dict.keys()))[:-4:-1]:
        if d[0] > 0:
            chosen_key_words.append(d[1])

    # Search in special keyword
    special_words_dict = dict.fromkeys(special_map.values(), 0)
    #  Select keyword using wordnet

    # Select keyword use only key word to select
    s = set(stopped_tokens)
    for t in special_map.keys():
        if t in s:
            special_words_dict[special_map[t]] += 1

    for d in sorted(zip(special_words_dict.values(), special_words_dict.keys()))[:-3:-1]:
        if d[0] > 0:
            chosen_key_words.append(d[1])

    return ', '.join(chosen_key_words)
Code Example #29
def preprocess_docs(documents):
    tokenizer = TweetTokenizer()
    english_stemmer = nltk.stem.SnowballStemmer('english')

    texts = [tokenizer.tokenize(d) for d in documents]

    stemmed_texts = []
    for text in texts:
        stemmed_text = [english_stemmer.stem(t) for t in text]
        stemmed_texts.append(stemmed_text)
    return stemmed_texts
Code Example #30
File: test_tokenize.py Project: Copper-Head/nltk
    def test_tweet_tokenizer(self):
        """
        Test TweetTokenizer using words with special and accented characters.
        """

        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
        s9 = "@myke: Let's test these words: resumé España München français"
        tokens = tokenizer.tokenize(s9)
        expected = [':', "Let's", 'test', 'these', 'words', ':', 'resumé',
                    'España', 'München', 'français']
        self.assertEqual(tokens, expected)
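For reference, a small sketch (not part of the NLTK test above) of what the two constructor flags exercised by the test do; exact token lists can differ between NLTK versions:

tok = TweetTokenizer(strip_handles=True, reduce_len=True)
# strip_handles removes the leading @remy; reduce_len caps any run of a
# repeated character at three, so "sooooo" comes out as "sooo".
print(tok.tokenize("@remy that was sooooo good!"))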
Code Example #31
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

line = "My name is Venkatram Veerareddy, technical architect.\n I am having 20 years of experience in "\
                          " Software industry working \nfrom applications to products by using \n" \
                          " C, C++, Java, Javascript and databases "\
                          " like Oracle, MS SQL Server, Postgres, MySQL and OrientDB."

lTokenizer = LineTokenizer()
print("Line tokenizer output: ", lTokenizer.tokenize(line))

sTokenizer = SpaceTokenizer()
print("Space Tokenizer output: ", sTokenizer.tokenize(line))

print("Word Tokenizer output: ", word_tokenize(line))

tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output: ",
      tTokenizer.tokenize("This is a coooool #dummysmiley: :-) :-P <3"))
Code Example #32
class Processing:
    def __init__(self, read_n_write):
        self.happy_emoticons = read_n_write.read_any_list(
            './resources/happy_emoticons.txt')
        self.sad_emoticons = read_n_write.read_any_list(
            './resources/sad_emoticons.txt')
        self.slang = read_n_write.read_any_list('./resources/slang.txt')
        self.wnl = nltk.WordNetLemmatizer()
        self.tokenizer = TweetTokenizer()
        self.stop_words = set(stopwords.words('english'))
        self.stop_words.update([
            'url', "i'm", '@name', "@name's", "that's", "doesn't", 'u',
            'would', 'else', 'anyone', "can't", "what's", "i've", 'could',
            "they're"
        ])

        self.happy_emoticons_count = 0
        self.sad_emoticons_count = 0
        self.emoji_count = 0
        self.slang_count = 0
        self.stopwords_count = 0
        self.emoji_list = []
        self.emoji_list.extend(self.happy_emoticons)
        self.emoji_list.extend(self.sad_emoticons)

        self.punc_list = set(string.punctuation)

        self.ngrams = {}
        self.ngrams_pos = {}

    def write_unk_emoji(self):
        print('Printing not emotion annotated emoticon list')
        na_emoticon = list(
            set(self.emoji_list) -
            set(self.happy_emoticons).union(set(self.sad_emoticons)))
        print(na_emoticon)
        for emoticon in na_emoticon:
            print(emoticon + '\t' + self.emoji_dict[emoticon])

    def process(self, tweets):
        self.happy_emoticons_count = 0
        self.sad_emoticons_count = 0
        self.emoji_count = 0
        self.slang_count = 0
        self.stopwords_count = 0
        list_tweets, list_tweet_lemmas = self.processing_pos(tweets)
        final_tweet_lemmas = []
        final_tweets = []
        for list_token, list_lemmas in zip(list_tweets, list_tweet_lemmas):
            single_tweet_lemma = []
            single_tweet = []
            for word, lemma in zip(list_token, list_lemmas):
                if self.all_count(word, lemma):
                    word = word.lower()
                    single_tweet.append(word)
                    single_tweet_lemma.append(lemma)
            final_tweet_lemmas.append(single_tweet_lemma)
            final_tweets.append(single_tweet)
        final_features = [
            self.happy_emoticons_count, self.sad_emoticons_count,
            self.emoji_count, self.slang_count, self.stopwords_count
        ]
        return final_features, final_tweets, final_tweet_lemmas

    def all_count(self, word, lemma):
        if word.strip() == '':
            return False
        emoji_flag = False
        try:
            if word in emoji.UNICODE_EMOJI or word in self.emoji_list:
                emoji_flag = True
                new_emoji = emoji.demojize(word)
                self.emoji_count += 1
                if not new_emoji in self.emoji_list:
                    self.emoji_list.append(new_emoji)
                if new_emoji in self.happy_emoticons:
                    self.happy_emoticons_count += 1
                elif new_emoji in self.sad_emoticons:
                    self.sad_emoticons_count += 1
        except:
            pass
        if word in self.slang or lemma in self.slang:
            self.slang_count += 1
        if word in self.stop_words or lemma in self.stop_words:
            self.stopwords_count += 1
            return False
        if word in self.punc_list:
            return False
        if emoji_flag:
            return False
        return True

    def processing_pos(self, tweets):
        list_lemmas = []
        list_tokens = []
        for tweet in tweets:
            words = []
            lemmas = []
            for word in self.tokenizer.tokenize(tweet):
                lemma = self.wnl.lemmatize(word.lower())
                if word in self.stop_words or word in self.punc_list or lemma in self.stop_words:
                    continue
                else:
                    lemmas.append(lemma)
                    words.append(word)
            list_lemmas.append(lemmas)
            list_tokens.append(words)
        return list_tokens, list_lemmas

    def processing_lemma(self, list_of_sent):
        list_output = []
        for sent in list_of_sent:
            words = []
            for word in self.tokenizer.tokenize(sent):
                word = self.wnl.lemmatize(word.lower())
                if word in self.stop_words or word in self.punc_list:
                    continue
                else:
                    words.append(word)
            list_output.append(words)
        return list_output
Code Example #33
File: settings.py Project: sardarr/Myweb
"""
Django settings for LCBweb project.

Generated by 'django-admin startproject' using Django 2.1.

For more information on this file, see
https://docs.djangoproject.com/en/2.1/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/2.1/ref/settings/
"""
from nltk.tokenize import TweetTokenizer

import os
from home.beliefEng.Belief_tagger import modelLoader
MODEL = modelLoader()
TOKENIZER = TweetTokenizer()

# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/2.1/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = '!q+5d6*yxnlrb-f5@n%9__c!gw&zf4mw9y+)drcodbq1@q@71$'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

ALLOWED_HOSTS = []
Code Example #34
import CRF.definitions as definitions
from sklearn.model_selection import StratifiedKFold
from CMUTweetTagger import runtagger_parse
from spacy.language import Tokenizer, GoldParse
from spacy.tokenizer import Tokenizer
from spacy.attrs import ORTH, LEMMA
import spacy
from sklearn.linear_model import SGDClassifier

nlp = spacy.load("en_core_web_sm")
tokenizer = Tokenizer(nlp.vocab)

lancaster_stemmer = LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
tknzr = TweetTokenizer(preserve_case=True,
                       strip_handles=False,
                       reduce_len=False)
stop = set(stopwords.words('english'))


def get_tuples(dspath):
    sentences = []
    s = ''
    tokens = []
    ners = []
    poss = []
    tot_sentences = 0
    ners_by_position = []
    index = 0
    with open(dspath) as f:
        for line in f:
Code Example #35
    totRepetitions = 0
    d = collections.defaultdict(int)
    for c in tw:
        d[c] += 1
    for c in sorted(d, key=d.get, reverse=True):
        if d[c] > 1:
            totRepetitions = totRepetitions + d[c]
    wordLength = sum(1 for c in tw)
    repPercent = totRepetitions / wordLength
    return repPercent


#client = corenlp.CoreNLPClient ( start_server=False , annotators="sentiment".split ( ) )

#Preprocessing and Tokenization
tk = TweetTokenizer()
p = Preprocess()
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=[
        'email', 'percent', 'money', 'phone', 'time', 'url', 'date', 'number'
    ],
    fix_html=True,  # fix HTML tokens
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    dicts=[emoticons])

#REPLACE with SPECIAL TAGS
Code Example #36
import re
import csv
import time
import os, sys, codecs
from nltk.text import Text
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import WordNetLemmatizer
import time

start = time.time()

lemmatizer = WordNetLemmatizer()

tweet = TweetTokenizer(strip_handles=True)
# Setting Stopwords
stop_words = set(stopwords.words('english'))
# Updating stop words with punctuation
stop_words.update([
    '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '/',
    '-', '~', '&', '*', '<', '>', '=', '%'
])
# updating stopwords with links
stop_words.update(['http', 'httpbitly', 'httptinyurl', '://'])
# updating stopwords with Expressions and words of no impact
stop_words.update([
    '่', 'ã€', 'ã€', '。', 'ã€', 'é', '|', 'ï¼', '…', '’', '่', '^',
    ',', ')', '้', 'ั', '#p2', '。', '’', '#tcot', 'ั', 'ã€', '่',
    'via', '、'
])
Code Example #37
class MultimodalPreprocessor:
    log = logging.getLogger("MulitmodalPreprocessor")

    def __init__(self, max_dict_size=MM_MAX_DICT_SIZE):
        self.max_dict_size = max_dict_size
        self.token_to_id = {TOKEN_UNK: 0}
        self.next_id = 1
        self.tokenizer = TweetTokenizer(preserve_case=True)

    def __len__(self):
        return len(self.token_to_id)

    def __call__(self, batch, cuda=False, device_id=None):
        """
        Convert list of multimodel observations (tuples with image and text string) into the form suitable
        for ModelMultimodal to disgest
        :param batch:
        """
        tokens_batch = []
        for img_obs, txt_obs in batch:
            tokens = self.tokenizer.tokenize(txt_obs)
            idx_obs = self.tokens_to_idx(tokens)
            tokens_batch.append((img_obs, idx_obs))
        # sort batch decreasing to seq len
        tokens_batch.sort(key=lambda p: len(p[1]), reverse=True)
        img_batch, seq_batch = zip(*tokens_batch)
        lens = list(map(len, seq_batch))

        # convert data into the target form
        # images
        img_v = Variable(torch.from_numpy(np.array(img_batch)))
        # sequences
        seq_arr = np.zeros(shape=(len(seq_batch), max(len(seq_batch[0]), 1)),
                           dtype=np.int64)
        for idx, seq in enumerate(seq_batch):
            seq_arr[idx, :len(seq)] = seq
            # Map empty sequences into single #UNK token
            if len(seq) == 0:
                lens[idx] = 1
        seq_v = Variable(torch.from_numpy(seq_arr))
        if cuda:
            img_v = img_v.cuda(device_id=device_id)
            seq_v = seq_v.cuda(device_id=device_id)
        seq_p = rnn_utils.pack_padded_sequence(seq_v, lens, batch_first=True)
        return img_v, seq_p

    def tokens_to_idx(self, tokens):
        res = []
        for token in tokens:
            idx = self.token_to_id.get(token)
            if idx is None:
                if self.next_id == self.max_dict_size:
                    self.log.warning(
                        "Maximum size of dict reached, token '%s' converted to #UNK token",
                        token)
                    idx = 0
                else:
                    idx = self.next_id
                    self.next_id += 1
                    self.token_to_id[token] = idx
            res.append(idx)
        return res

    def save(self, file_name):
        with open(file_name, 'wb') as fd:
            pickle.dump(self.token_to_id, fd)
            pickle.dump(self.max_dict_size, fd)
            pickle.dump(self.next_id, fd)

    @classmethod
    def load(cls, file_name):
        with open(file_name, "rb") as fd:
            token_to_id = pickle.load(fd)
            max_dict_size = pickle.load(fd)
            next_id = pickle.load(fd)

            res = MultimodalPreprocessor(max_dict_size)
            res.token_to_id = token_to_id
            res.next_id = next_id
            return res
Code Example #38
    TASK = "A"  # Define, A or B
    FNAME = './predictions-task' + TASK + '.txt'
    PREDICTIONSFILE = open(FNAME, "w")

    K_FOLDS = 10  # 10-fold crossvalidation
    CLF = LinearSVC()  # the default, non-parameter optimized linear-kernel SVM

    # Loading dataset and featurised simple Tfidf-BoW model
    corpus, y = parse_dataset(DATASET_FP)
    X, vectorizer = featurize(corpus)

    class_counts = np.asarray(np.unique(y, return_counts=True)).T.tolist()
    print(class_counts)

    print(corpus)
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True).tokenize
    tokens = tokenizer('\n'.join(corpus))
    finder = BigramCollocationFinder.from_words(tokens)
    bigram_measures = BigramAssocMeasures()
    scored = finder.score_ngrams(bigram_measures.student_t)
    sorted(bigram for bigram, score in scored)
    for bigram, score in scored[:10]:
        print(' '.join(bigram), score)

    CLF.fit(X, y)

    # Returns an array of the same size as 'y' where each entry is a prediction obtained by cross validated
    predicted = cross_val_predict(CLF, X, y, cv=K_FOLDS)

    most_informative_feature_for_binary_classification(vectorizer, CLF, n=10)

    # Modify F1-score calculation depending on the task
Code Example #39
class SocialTextProcessor(AbstractDataProcessor):
    english_stopwords = english_stopwords.words()
    english_dictionary = dict.fromkeys(nltk_words.words(), None)
    my_stopwords = ["still", "just", "emoji", "open", "go", "coin", "see"]
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    t_word_tokenizer = TweetTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()

    @classmethod
    def process_document(cls, document):

        document = cls.html_processing(document)

        #tokenize
        words = cls.t_word_tokenizer.tokenize(document)

        #print(" \n Tokenizing: {} \n".format(words))
        #expand contractions
        words = cls.expand_contractions(words)
        #print("Expanding contractions: {} \n".format(words))

        # to lowercase
        words = list(map(str.lower, words))

        tagged_sentence = pos_tag(words)
        proper_nouns_tags = ['IN', 'NNP', 'PRP', 'PRP$', 'WP$']
        tagged_sentence = [(word, tag) for word, tag in tagged_sentence
                           if tag not in proper_nouns_tags]

        #print("Filtering tags: {} \n".format(tagged_sentence))

        words = []
        for word, tag in tagged_sentence:
            wordnet_tag = cls.find_wordnet_tag(tag)
            if wordnet_tag != '':
                word = cls.remove_apos(word)
                words.append(
                    cls.lemmatizer.lemmatize(word.lower(), wordnet_tag))
            elif word in string.punctuation:
                words.append(word)

        #print("Lemmatize: {} \n".format(words))
        # must be reviewed
        words = [
            word for word in words if word not in string.punctuation
            and len(word) > 1 and cls.is_english_word(word.lower())
        ]
        #print("Punctuation and english: {} \n".format(words))

        words = mark_negation(words)
        #print("Negation: {} \n".format(words))

        stop_words = set(cls.english_stopwords + cls.my_stopwords)
        words = [word for word in words if word.lower() not in stop_words]

        #print("Stop words: {} \n".format(words))

        return words

    @classmethod
    def remove_apos(cls, text):
        while "'" in text:
            text = text.replace("'", "")
        return text

    @classmethod
    def expand_contractions(cls, words):
        expanded_words = []
        for word in words:
            if word.lower() in CONTRACTION_MAP.keys():
                expanded_words += word_tokenize(CONTRACTION_MAP[word.lower()])
            else:
                expanded_words.append(word)
        return expanded_words

    @classmethod
    def html_processing(cls, text):
        # remove urls
        text = re.sub(r"http\S+", ' ', text)

        # remove #
        text = re.sub(r'#(\S+)', r' \1 ', text)

        # remove digits
        text = re.sub(pattern=r"\d", repl=r"", string=text)

        # replace users, tags with empty space
        text = re.sub(r'@[\S]+', ' ', text)

        # Replace #word with empty space
        text = re.sub(r'#([^\s]+)', ' ', text)

        # remove duplicated characters
        text = re.sub(r'\s+', ' ', text)

        text = re.sub(r'(.)\1+', r'\1\1', text)

        return text

    @classmethod
    def find_wordnet_tag(cls, tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    @classmethod
    def is_english_word(cls, word):
        try:
            cls.english_dictionary[word]
            return True
        except KeyError:
            return False
Code Example #40
    def new_oracle_data(self):

        print("Creating New " + self.data_file_name + " File.")

        path = os.path.join(self.data_dir, self.data_file)
        tknzr = TweetTokenizer(preserve_case=False)
        oracle_data = dict()
        _id = 0

        ans2tok = {'Yes': 1, 'No': 0, 'N/A': 2}

        with gzip.open(path) as file:
            for json_game in file:
                game = json.loads(json_game.decode("utf-8"))

                if self.successful_only:
                    if not game['status'] == 'success':
                        continue

                if self.history:
                    prev_ques = list()
                    prev_answer = list()
                    prev_length = 0
                for i, qa in enumerate(game['qas']):
                    q_tokens = tknzr.tokenize(qa['question'])
                    q_token_ids = [
                        self.word2i[w]
                        if w in self.word2i else self.word2i['<unk>']
                        for w in q_tokens
                    ][:self.max_src_length]
                    a_token = ans2tok[qa['answer']]

                    length = len(q_token_ids)

                    if self.history:
                        question = prev_ques + prev_answer + q_token_ids
                        question_length = prev_length + length
                    else:
                        question = q_token_ids
                        question_length = length

                    if self.history:
                        question.extend([self.word2i['<padding>']] *
                                        (self.max_diag_len - len(question)))
                    else:
                        question.extend([self.word2i['<padding>']] *
                                        (self.max_src_length - len(question)))

                    for i, o in enumerate(game['objects']):
                        if o['id'] == game['object_id']:
                            # target object information
                            spatial = get_spatial_feat_v2(
                                bbox=o['bbox'],
                                im_width=game['image']['width'],
                                im_height=game['image']['height'])
                            object_category = o['category_id']
                            break

                    oracle_data[_id] = dict()
                    oracle_data[_id]['question'] = question
                    oracle_data[_id]['length'] = question_length
                    oracle_data[_id]['answer'] = a_token
                    oracle_data[_id]['image_file'] = game['image']['file_name']
                    oracle_data[_id]['spatial'] = spatial
                    oracle_data[_id]['game_id'] = str(game['id'])
                    oracle_data[_id]['obj_cat'] = object_category

                    prev_ques = copy.deepcopy(q_token_ids)
                    prev_answer = [copy.deepcopy(a_token)]
                    prev_length = length + 1

                    _id += 1

        oracle_data_path = os.path.join(self.data_dir, self.data_file_name)
        with io.open(oracle_data_path, 'wb') as f_out:
            data = json.dumps(oracle_data, ensure_ascii=False)
            f_out.write(data.encode('utf8', 'replace'))

        print('done')

        with open(oracle_data_path, 'r') as file:
            oracle_data = json.load(file)

        return oracle_data
Code Example #41
    inv_target_dict[i] = tar
    i += 1

x = set()
with open("../train dataset/Stance.csv", "rb") as f:
    for row in f:
        x.add(row.strip())
x = list(x)
i = 0
for tar in x:
    stance_dict[tar] = i
    inv_stance_dict[i] = tar
    i += 1

# print target_dict,stance_dict
tknzr = TweetTokenizer()
x_train, y_train = [[] for i in range(5)], [[] for i in range(5)]
X_train, Y_train = [[] for i in range(5)], [[] for i in range(5)]

with open("../train dataset/Tweet.csv",
          "rb") as f1, open("../train dataset/Target.csv",
                            "rb") as f2, open("../train dataset/Stance.csv",
                                              "rb") as f3:
    for l1, l2, l3 in izip(f1, f2, f3):

        tweet = tknzr.tokenize(l1.strip())
        x_train[target_dict[l2.strip()]].append(tweet)
        y_train[target_dict[l2.strip()]].append(l3.strip())

x_dev, y_dev = [[] for i in range(5)], [[] for i in range(5)]
X_dev, Y_dev = [[] for i in range(5)], [[] for i in range(5)]
Code Example #42
def process(text, tokenizer=TweetTokenizer(), stopwords=[]):
    text = text.lower()
    tokens = tokenizer.tokenize(text)
    return [word for word in tokens if word not in stopwords and not word.isdigit()]
Code Example #43
def preprocessing(document_body):
    tokenizer = TweetTokenizer()
    token_list = tokenizer.tokenize(document_body)
    token_list = [str for str in token_list if str != '.']

    return token_list
Code Example #44
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers, optimizers

import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import tokenize

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import preprocessing

tknzr = TweetTokenizer()
stop_set = set(stopwords.words('english') + list(string.punctuation))

def preprocessing_tweet(folder, category, all_text, all_time, all_freq, label, reviews):
    count = 0
    for filename in os.listdir(folder):
#        if count > 10:
#            break
#        count +=1
        
        sentences = []
        time = []
        freq = []
        if os.path.isdir(os.path.join(folder, filename)):
            path = folder + '/' + filename + '/tweets.json'
            tweets_data = []
Code Example #45
File: similarity.py Project: lorabit/MiningReasons
import enchant
from nltk.stem.porter import *
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

t = TweetTokenizer()
d = enchant.Dict("en_US")
stemmer = PorterStemmer()
stopword = set(stopwords.words('english'))


def tokenize(text):
    ret = t.tokenize(text)
    return ret


def stem(text):
    ret = []
    for word in tokenize(text):
        word = word.lower()
        if not d.check(word):
            continue
        if word in stopword:
            continue
        word = stemmer.stem(word)
        ret += [word]
    return ret


def similarity(candidate1, candidate2):
    set1 = set(stem(candidate1))
Code Example #46
 def __init__(self, max_dict_size=MM_MAX_DICT_SIZE):
     self.max_dict_size = max_dict_size
     self.token_to_id = {TOKEN_UNK: 0}
     self.next_id = 1
     self.tokenizer = TweetTokenizer(preserve_case=True)
Code Example #47
def isExtroverted(s):
    print(s)
    tempL = ['ESTP', 'ESTJ', 'ESFP', 'ESFJ', 'ENTP', 'ENTJ', 'ENFP', 'ENFJ']
    ret = []
    for i in s:
        if i in tempL:
            ret.append(True)
        else:
            ret.append(False)
    return ret


fil = list(csv.reader(open('mbti_big5scores.csv')))
vocabFile = open('top500vocab.txt', 'r')
tknzr = TweetTokenizer()
#sentAn = SentimentIntensityAnalyzer() #sentiment analyzer
lancaster = LancasterStemmer()  #PorterStemmer()
wordnetlem = WordNetLemmatizer()
countVect = CountVectorizer()
saver = tf.train.Saver()

vocab = set()
stopWords = set(stopwords.words('english'))
features = {}
textTrack = {}
puncts = set(string.punctuation)

#Neural network statistics
dispEpoch = 2
saveEveryNEpochs = 5
Code Example #48
# Import the necessary modules
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer
# Write a pattern that matches both mentions (@) and hashtags
pattern2 = r"(@\w+|#\w+)"
tweets.append('some of @my_story placed and some #tag')
# Use the pattern on the last tweet in the tweets list
mentions_hashtags = regexp_tokenize(tweets[-1], pattern2)
print(mentions_hashtags)

# Import the necessary modules
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize  # needed for word_tokenize(german_text) below
# Use the TweetTokenizer to tokenize all tweets into one list
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)

german_text = 'Wann gehen wir Pizza essen? 🍕 Und fährst du mit Über? 🚕'
# Tokenize and print all words in german_text
all_words = word_tokenize(german_text)
print(all_words)

# # Tokenize and print only capital words
capital_words = r"[A-Z\Ü]\w+"
print(regexp_tokenize(german_text, capital_words))

# Tokenize and print only emoji
emoji = "['\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF']"
print(regexp_tokenize(german_text, emoji))
Code Example #49
def represent_tweet(tweets ):
    tokens = TweetTokenizer().tokenize(tweets)
    frequency = defaultdict(int)
    for token in tokens:
        frequency[token] += 1
    return frequency
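An equivalent, more compact variant of the function above using collections.Counter (an alternative sketch, not the original author's code):

from collections import Counter
from nltk.tokenize import TweetTokenizer

def represent_tweet_counter(tweet):
    # Counter performs the same per-token frequency count as the defaultdict loop.
    return Counter(TweetTokenizer().tokenize(tweet))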
Code Example #50
#  python library imports
import string
import re
import nltk
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

nlp = spacy.load("en_core_web_sm")

TKNZ = TweetTokenizer()
LEM = WordNetLemmatizer()
STOP_WORDS = stopwords.words('english')


def filter_tweets(tweets_list, expr):
    output = []

    try:  # list of tweets
        for tweet in tweets_list:
            text = tweet['text']
            found = re.search(expr, text)
            if found:
                output.append(tweet['text'])

    except:  # list of tweet 'text' body
        for text in tweets_list:
            found = re.search(expr, text)
            if found:
                output.append(text)
コード例 #51
0
# -*- coding: utf-8 -*-
"""
Created on Fri Sep  6 21:56:45 2019

@author: Lakshay Dhiman
"""
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
import pandas as pd
from nltk.corpus import wordnet
import random
dataset = pd.read_csv('tweets-dataset.csv')
x = dataset.iloc[:, :].values
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
y = set()
t = 0
for i in range(len(x)):
    p = tknzr.tokenize(x[i][0])
    for j in range(len(p)):
        if (p[j] != '?' and p[j] != '!' and p[j] != '.' and p[j] != ','):
            y.add(p[j])
            t = t + 1
z = list(y)
p = []
for i in range(40000):
    h = random.randint(0, len(z) - 1)
    syn = []
    for j in wordnet.synsets(z[h]):
        for k in j.lemmas():
            if (z[h] == k.name()):
                syn.append(k.name())
コード例 #52
0
class StreamProcess(KafkaConsumer):
    def __init__(self, *args, **kwargs):
        self.broker = kwargs['bootstrap_servers']
        self._classifier_filepath = kwargs.pop('classifier_filepath', None)
        self.influxdb_host = kwargs.pop('influxdb_host', 'localhost')
        self.influxdb_port = kwargs.pop('influxdb_port', 8086)
        self.influxdb_database = kwargs.pop('influxdb_database', None)

        super().__init__(*args, **kwargs)

        self._stopwords = stopwords.words('english')

        with open(self._classifier_filepath, 'rb') as f:
            self._classifier = pickle.load(f)

        self._word_tokenizer = TweetTokenizer(preserve_case=True,
                                              reduce_len=False,
                                              strip_handles=False)

        self._lemmatizer = WordNetLemmatizer()

        self.influxdb_client = InfluxDBClient(host=self.influxdb_host,
                                              port=self.influxdb_port,
                                              username='******',
                                              password='******',
                                              database=self.influxdb_database)
        self.influxdb_client.create_database(self.influxdb_database)

    def process(self):
        try:
            message = self.__next__()
            tweet = message.value.decode('utf-8').strip()

            polarity = self._classify(tweet)

            wrapper = '+' if polarity == 'Positive' else '-'

            data_point = [{
                # "timestamp":
                "measurement": "sentiments",
                "tags": {
                    "language": "en",
                    "polarity": polarity
                },
                "fields": {
                    "tweet": tweet
                }
            }]

            if self.influxdb_client.write_points(data_point):
                logging.info("DB SUCCESSFUL")
            else:
                logging.info("DB FAILED")

            # logging.info(message.offset)

        except StopIteration as e:
            logging.warning(
                "No incoming message found at Kafka broker: {}.".format(
                    self.broker))
            return

    def _tokenize(self, tweet):
        return self._word_tokenizer.tokenize(tweet)

    def _is_noise(self, word):
        pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(@[A-Za-z0-9_]+)'
        return word in string.punctuation \
            or word.lower() in self._stopwords \
            or re.search(pattern, word, re.IGNORECASE) != None

    def _tag2type(self, tag):
        """
        Take a tag and return a type
        Common tags are:
            - NNP: Noun, proper, singular
            - NN: Noun, common, singular or mass
            - IN: Preposition or conjunction, subordinating
            - VBG: Verb, gerund or present participle
            - VBN: Verb, past participle

        return 'n' for noun, 'v' for verb, and 'a' (adjective, used as the default) otherwise
        """
        if tag.startswith('NN'):
            return 'n'
        elif tag.startswith('VB'):
            return 'v'
        else:
            return 'a'

    def _lemmatize(self, tokens):
        return [
            self._lemmatizer.lemmatize(word, self._tag2type(tag)).lower()
            for word, tag in pos_tag(tokens) if not self._is_noise(word)
        ]

    def _classify(self, tweet):
        tokens = self._lemmatize(self._tokenize(tweet))
        return self._classifier.classify(
            dict([token, True] for token in tokens))
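
# A minimal usage sketch (not part of the original snippet): the topic name,
# broker address, classifier path and database name below are placeholders.
if __name__ == '__main__':
    consumer = StreamProcess('tweets',
                             bootstrap_servers='localhost:9092',
                             classifier_filepath='classifier.pickle',
                             influxdb_database='tweet_sentiments')
    while True:
        consumer.process()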
コード例 #53
0
def demo_tweets(trainer, n_instances=None, output=None):
    """
    Train and test Naive Bayes classifier on 10000 tweets, tokenized using
    TweetTokenizer.
    Features are composed of:
        - 1000 most frequent unigrams
        - 100 top bigrams (using BigramAssocMeasures.pmi)

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total tweets that have to be used for
        training and testing. Tweets will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.tokenize import TweetTokenizer
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import twitter_samples, stopwords

    # Different customizations for the TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False)
    # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
    # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)

    neg_docs = parse_tweets_set(negative_csv,
                                label='neg',
                                word_tokenizer=tokenizer)
    pos_docs = parse_tweets_set(positive_csv,
                                label='pos',
                                word_tokenizer=tokenizer)

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    # stopwords = stopwords.words('english')
    # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords]
    all_words = [word for word in sentim_analyzer.all_words(training_tweets)]

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)

    # Add bigram collocation features
    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats(
        [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12)
    sentim_analyzer.add_feat_extractor(extract_bigram_feats,
                                       bigrams=bigram_collocs_feats)

    training_set = sentim_analyzer.apply_features(training_tweets)
    test_set = sentim_analyzer.apply_features(testing_tweets)

    classifier = sentim_analyzer.train(trainer, training_set)
    # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            'Your classifier does not provide a show_most_informative_features() method.'
        )
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output,
                        Dataset='labeled_tweets',
                        Classifier=type(classifier).__name__,
                        Tokenizer=tokenizer.__class__.__name__,
                        Feats=extr,
                        Results=results,
                        Instances=n_instances)
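
# Example invocation (a sketch, not part of the module): train NLTK's standard
# Naive Bayes classifier on the bundled twitter_samples corpus.
if __name__ == '__main__':
    from nltk.classify import NaiveBayesClassifier
    demo_tweets(NaiveBayesClassifier.train, n_instances=8000)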
コード例 #54
0
tweet2 = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet2)

# remove hashtags
# only removing the hash # sign from the word
tweet2 = re.sub(r'#', '', tweet2)

print(tweet2)

# Tokenize the string
print()
print('\033[92m' + tweet2)
print('\033[94m')

# instantiate tokenizer class
tokenizer = TweetTokenizer(preserve_case=False,
                           strip_handles=True,
                           reduce_len=True)

# tokenize tweets
tweet_tokens = tokenizer.tokenize(tweet2)

print()
print('Tokenized string:')
print(tweet_tokens)

# Remove stop words and punctuations
# Import the english stop words list from NLTK
stopwords_english = stopwords.words('english')

print('Stop words\n')
print(stopwords_english)
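
# The removal step itself is cut off in this snippet; below is a sketch of the
# usual cleanup loop (string is assumed to be imported for the punctuation list).
import string

tweets_clean = []
for word in tweet_tokens:
    if word not in stopwords_english and word not in string.punctuation:
        tweets_clean.append(word)

print('Cleaned tweet tokens:')
print(tweets_clean)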
コード例 #55
0
def get_tags(tweet):
    return [
        word for word in TweetTokenizer().tokenize(tweet)
        if word.startswith('#')
    ]
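# Usage sketch (not from the original file):
# get_tags("loving the #nlp meetup #python") returns ['#nlp', '#python']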
コード例 #56
0
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(
    lambda tw: remove_squarebrackets(tw))
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(
    lambda tw: remove_nonunicode(tw))

df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(
    lambda tw: remove_symbols(tw))
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(
    lambda tw: tw.lower())
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(
    lambda tw: reduce_lengthening(tw))

# tokenization
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(
    lambda tw: TweetTokenizer().tokenize(tw))

#df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(lambda tw : spell_correction(tw))
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(
    lambda tw: remove_numbers(tw))
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(
    lambda tw: normalize_slangs(tw))
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(
    lambda tw: remove_stopwords(tw))
df['preprocessed_tweet'] = df['preprocessed_tweet'].apply(
    lambda tw: stemming(tw))

#df.to_csv("trainset_preprocessed.csv")
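
# The preprocessing helpers applied above are defined elsewhere in the project;
# the sketches below show one plausible shape for two of them (bodies assumed).
import re
from nltk.corpus import stopwords


def reduce_lengthening(text):
    # collapse characters repeated 3+ times ("soooo" -> "soo")
    return re.sub(r'(.)\1{2,}', r'\1\1', text)


def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [tok for tok in tokens if tok.lower() not in stop_words]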
"""## Classification

### Stratified sampling
コード例 #57
0
def tokenize(tweet):
    tknzr = TweetTokenizer(strip_handles=True,
                           reduce_len=True,
                           preserve_case=False)
    return tknzr.tokenize(tweet)
コード例 #58
0
ファイル: qa-system.py プロジェクト: harsimratK/NLP
def TokenWords(My_sent):
	tokenizer_words = TweetTokenizer()
	tokens_words = [tokenizer_words.tokenize(t) for t in My_sent]

	return tokens_words
コード例 #59
0
db=conn[db_name]
# collection
collection = db.tweets
# query: find all documents
results = collection.find()
# convert the results to a list while the connection is still open
# (the cursor is lazy, so it must be exhausted before closing)
list_results = list(results)
# close the mongoDB connection
conn.close()
# print the time and the text
for record in list_results:
    print 'At %s: \t %s.' % (record['time'], record['text'])

# *** word frequency mining ****
# tokenizer
tweet_tokenizer = TweetTokenizer()
# punctuation list
punct = list(string.punctuation)
# download the English stop words list
import nltk
nltk.download('stopwords')
# list of stop words and punctuations
stopword_list = stopwords.words('english') + punct + ['rt', 'via']

# record the number of occurrences of each word
tf = Counter()
all_dates = []

# get the text and the time
for element in list_results:
    message = element['text']
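    # (sketch of the likely continuation, which the snippet cuts off here:
    #  tokenize, drop stop words/punctuation, then update the counters)
    tokens = tweet_tokenizer.tokenize(message)
    terms = [tok.lower() for tok in tokens if tok.lower() not in stopword_list]
    tf.update(terms)
    all_dates.append(element['time'])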
コード例 #60
0
#unsupervised learning:
#see if clusters exist between different types of hate speech
#use tf-idf, dimensionality reduction, then clustering algorithm to separate
#if clusters of hate speech don't exist, try and find other clusters with other datasets

#priorities:
#1: unsupervised learning and dimensionality reduction (try TSNE https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html)
#2: PCA, cross-validation error, other classifiers and encodings
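
#A rough, self-contained sketch of that plan (illustrative only, not the
#project's code): TF-IDF -> TruncatedSVD -> t-SNE embedding -> KMeans clusters.
def _cluster_sketch():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    from sklearn.manifold import TSNE
    from sklearn.cluster import KMeans

    corpus = ["example tweet one", "another example tweet",
              "totally different text", "more text over here"]
    X = TfidfVectorizer().fit_transform(corpus)               # sparse tf-idf matrix
    X_svd = TruncatedSVD(n_components=2).fit_transform(X)     # dense, low-rank features
    X_2d = TSNE(n_components=2, perplexity=2).fit_transform(X_svd)  # 2-D embedding to plot
    labels = KMeans(n_clusters=2, n_init=10).fit_predict(X_svd)     # cluster assignments
    return X_2d, labels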

#data processing
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(reduce_len=True)
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
from joblib import dump, load
import os.path
from os import path

#get pd dataframe for training data
df_data = pd.read_csv("twitter-sentiment-analysis-hatred-speech/train.csv",
                      names=('id', 'label', 'tweet'),
                      header=None)


def processTweet(tweet):
    tokens = tknzr.tokenize(tweet[0:-1])