def clean_tokenize_stem_tweets_baseline(raw_tweets, token_file, tagging_file, force_new=False):
    # Reuse cached tokens/tags if both pickle files exist and match the corpus size.
    if os.path.isfile(token_file) and os.path.isfile(tagging_file) and not force_new:
        print('reading tokens and tagging results')
        with open(token_file, 'rb') as inf:
            token_lists = pickle.load(inf)
        with open(tagging_file, 'rb') as inf:
            tagging_results = pickle.load(inf)
        if len(token_lists) == raw_tweets.shape[0] and len(tagging_results) == raw_tweets.shape[0]:
            print('tokens and tagging results are correct')
            return token_lists, tagging_results
    print('performing new tokenization and tagging')
    # otherwise create new ones
    stemmer = PorterStemmer()
    url_compiled = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
    mention_compiled = re.compile(r'@\w*')
    html_compiled = re.compile(r"&#?\w+;")
    # Strip HTML entities, URLs, and @-mentions before tagging.
    tweets_cleaned = [html_compiled.sub(' ', tweet) for tweet in raw_tweets]
    tweets_cleaned = [url_compiled.sub(' ', tweet) for tweet in tweets_cleaned]
    tweets_cleaned = [mention_compiled.sub(' ', tweet) for tweet in tweets_cleaned]
    tagging_results = CMUTweetTagger.runtagger_parse(tweets_cleaned)
    token_lists = [[stemmer.stem(word_result[0]) for word_result in tweet_result]
                   for tweet_result in tagging_results]
    # save to file
    with open(token_file, 'wb') as outf:
        pickle.dump(token_lists, outf)
    with open(tagging_file, 'wb') as outf:
        pickle.dump(tagging_results, outf)
    return token_lists, tagging_results
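# Usage sketch (my assumption: raw_tweets exposes .shape, so it is presumably a
# pandas Series or numpy array of strings; the pickle file names are placeholders):
import pandas as pd

tweets = pd.Series(["@foo check www.example.com &amp; more", "plain tweet"])
tokens, tags = clean_tokenize_stem_tweets_baseline(
    tweets, "tokens.pkl", "tags.pkl", force_new=True)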
def boost_df(X):
    boost_entity = {}
    dfVoc = {}  # document frequency per vocabulary term (undefined in the original)
    boosted_wdfVoc = {}
    Xdense = np.matrix(X).astype('float')
    X_scaled = preprocessing.scale(Xdense)
    X_normalized = preprocessing.normalize(X_scaled, norm='l2')
    vocX = vectorizer.get_feature_names()  # relies on a module-level vectorizer
    pos_tokens = CMUTweetTagger.runtagger_parse(
        [term.upper().encode() for term in vocX])
    for l in pos_tokens:
        term = ''
        for gr in range(0, len(l)):
            term += l[gr][0].lower() + " "
        # '^' is the CMU tagset's proper-noun tag: boost entity-bearing terms
        if "^" in str(l):
            boost_entity[term.strip()] = 2.5
        else:
            boost_entity[term.strip()] = 1.0
    dfX = X.sum(axis=0)
    keys = vocX
    vals = dfX
    for k, v in zip(keys, vals):
        dfVoc[k] = v
    for k in dfVoc:
        try:
            boosted_wdfVoc[k] = dfVoc[k] * boost_entity[k]
        except KeyError:  # term missing from the boost table: keep the raw df
            boosted_wdfVoc[k] = dfVoc[k]
    return X_normalized, boosted_wdfVoc
def pos_tagger(self):
    tweets = []
    for tw in self.tweet_original:
        try:
            tw = tw.decode('unicode_escape').encode('ascii', 'ignore')
        except UnicodeDecodeError:
            # drop stray backslashes, then retry the decode
            tw = re.sub(r'\\+', '', tw)
            tw = tw.decode('unicode_escape').encode('ascii', 'ignore')
        tweets.append(tw)
    sent_tags = CMUTweetTagger.runtagger_parse(tweets)
    i = 0
    for sent in sent_tags:
        # build uni/bi/trigram features over the POS tags of each tweet
        unigrams = [tag_tuple[1] for tag_tuple in sent]
        bigrams = set(nltk.bigrams(unigrams))
        trigrams = set(nltk.trigrams(unigrams))
        self.tweet_unigram[self.tweet_id[i]] = set(unigrams)
        self.tweet_bigram[self.tweet_id[i]] = bigrams
        self.tweet_trigram[self.tweet_id[i]] = trigrams
        self.tweet_feature_list.extend(unigrams)
        self.tweet_feature_list.extend(bigrams)
        self.tweet_feature_list.extend(trigrams)
        i += 1
    self.tweet_feature_list = list(set(self.tweet_feature_list))
def genPOSTags(text_path, pos_path, pos_corpus_file, verbose=True):
    with open(os.path.join(param.dump_folder, text_path), "rb") as handle:
        comment_list, _ = pickle.load(handle)
    # POS-tag the comments in chunks so the tagger subprocess is not fed
    # the whole corpus at once.
    pos_comment_list = []
    max_sent_num = 5000
    ind = 0
    while ind < len(comment_list):
        sent_list = comment_list[ind:ind + max_sent_num]
        tok_sent_list = [
            sent.lower()
            for sent in CMUTweetTokenizer.runtokenizer_parse(sent_list)
        ]
        raw_pos_list = CMUTweetTagger.runtagger_parse(tok_sent_list)
        pos_list = []
        for raw_seq in raw_pos_list:
            seq = [tup[1] for tup in raw_seq]  # keep only the tag column
            pos_list.append(seq[:])
        pos_comment_list = pos_comment_list + pos_list
        ind += max_sent_num
    with open(os.path.join(param.dump_folder, pos_path), "wb") as handle:
        pickle.dump(pos_comment_list, handle)
    if text_path.endswith("comm.data"):
        pos_corpus_file.write("\n".join(
            [" ".join(pos_comm) for pos_comm in pos_comment_list]))
        pos_corpus_file.write("\n")
    if verbose:
        print("# pos sequences:", len(pos_comment_list))
        for i in range(3):
            print("example of pos sequence:", pos_comment_list[i])
        print("save pos to {}".format(pos_path))
def update_edges_tag(database):
    tweets = [u"someone is cold game nd he needs to follow me",
              u"only 3mths left in school . i wil always mis my skull , frnds and my teachrs"]
    lot = CMUTweetTagger.runtagger_parse(tweets)
    norm = normalizer.Normalizer(lot, database)
    # tags = norm.nodes.distinct('tag')
    tags = [u'A', u'N', u'^', u'V', u'!', u'O', u'G', u'S', u'R', u',',
            u'P', u'Z', u'L', u'D', u'&', u'T', u'X', u'Y', u'M']
    for tag in tags:
        nouns = [node['node']
                 for node in filter(lambda x: x['freq'] > 8, norm.nodes.find({'tag': tag}))]
        norm.edges.update({'from': {'$in': nouns}}, {'$set': {u'from_tag': tag}}, multi=True)
        norm.edges.update({'to': {'$in': nouns}}, {'$set': {u'to_tag': tag}}, multi=True)
def parse_words_by_ark_nlp_batch(self, tweets, preserve_types, ark_run_cmd):
    token_lists = CMUTweetTagger.runtagger_parse(tweets, run_tagger_cmd=ark_run_cmd)
    ret = []
    for tokens in token_lists:
        filtered_tokens = self.filter_by_type(tokens, preserve_types)
        words = self.parse_tokenized_words(filtered_tokens)
        ret.append(words)
    return ret
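# A standalone sketch of the same batch-then-filter pattern, minus the host
# class (assumptions: a local ark-tweet-nlp jar, and 'N'/'^' as the CMU
# tagset's common/proper-noun tags):
import CMUTweetTagger

ARK_CMD = "java -XX:ParallelGCThreads=2 -Xmx500m -jar ark-tweet-nlp-0.3.2.jar"

def words_by_tag(tweets, keep_tags=frozenset({'N', '^'})):
    # runtagger_parse returns one [(token, tag, confidence), ...] list per tweet
    tagged = CMUTweetTagger.runtagger_parse(tweets, run_tagger_cmd=ARK_CMD)
    return [[tok for tok, tag, conf in sent if tag in keep_tags] for sent in tagged]

print(words_by_tag(["Diabetes is tryna proceed to me LOL"]))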
def pos_tag_tweets(tweet_list, pkl_file='pickle/pos_tweets.pkl'):
    tagged_tweets = CMUTweetTagger.runtagger_parse(tweet_list,
                                                   run_tagger_cmd=tweet_stack_command)
    if pkl_file is not None:
        serialize_object(tagged_tweets, pkl_file)
    return tagged_tweets
def boost_entities(features):
    boost_entity = {}
    pos_tokens = CMUTweetTagger.runtagger_parse([term.upper() for term in features])
    for line in pos_tokens:
        term = ''
        for entity in range(len(line)):
            term += line[entity][0].lower() + " "
        if "^" in str(line):  # '^' marks a proper noun in the CMU tagset
            boost_entity[term.strip()] = 2.5
        else:
            boost_entity[term.strip()] = 1.0
    return boost_entity
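# Note: the `"^" in str(line)` test matches a caret anywhere in the stringified
# (token, tag, confidence) tuples, so a caret inside a token would also trigger
# the boost. A rewrite of mine (not from the source) that inspects only the tag
# field:
def boost_entities_strict(features):
    boost = {}
    for sent in CMUTweetTagger.runtagger_parse([t.upper() for t in features]):
        term = " ".join(tok.lower() for tok, tag, conf in sent)
        has_proper_noun = any(tag == '^' for tok, tag, conf in sent)
        boost[term] = 2.5 if has_proper_noun else 1.0
    return boost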
def tag_tweets(tweets):
    tagResults = CMUTweetTagger.runtagger_parse(tweets)
    result = []
    for tweet in tagResults:
        # w = word, t = tag, c = confidence level
        tuplesList = []
        for triple in tweet:
            (w, t, c) = triple
            # removing urls, user mentions, numbers, and hashtags from the tweet
            if t != 'U' and t != '@' and t != '$' and t != '#':
                tuplesList.append((w, t))
        result.append(tuplesList)
    return result
def posTagged(tweets):
    taggedData = CMUTweetTagger.runtagger_parse(tweets)
    posTagsPerTweet = []
    for tweet in taggedData:
        sumConf = 0.0
        count = 0
        for tokenTup in tweet:
            sumConf = sumConf + tokenTup[2]  # tokenTup = (word, tag, confidence)
            count = count + 1
        avgConf = 0 if count == 0 else (sumConf / count)
        posTagsPerTweet.append((tweet, avgConf))
    return posTagsPerTweet
def run(tweets, slang, not_oov, threshold=1.5, slang_threshold=1,
        max_val=[1., 1., 0.5, 0.0, 1.0, 0.5], distance=2, oov_fun=ovvFunc):
    pos_tagged = CMUTweetTagger.runtagger_parse(tweets)
    window_size = 7
    matrix1 = calc_score_matrix(pos_tagged, oov_fun, window_size, database='tweets2')
    if not slang:
        slang = tools.get_slangs()
    fms = add_slangs(matrix1, slang)
    fmd = add_from_dict(fms, matrix1, distance, not_oov)
    mapp = construct_mapp(pos_tagged, oov_fun)
    fm_reduced = add_nom_verbs(fmd, mapp, slang_threshold=slang_threshold)
    feat_mat = iter_calc_lev(matrix1, fm_reduced, not_ovv=not_oov)
    res = calc_results(feat_mat, not_oov, max_val=max_val, threshold=threshold)
    return res
def resolve(self, original):
    data = [self.normalizeKey(twt) for twt in set(original)]

    if enabled_modules['caches'] is not None:
        # Tag all uncached data
        uncached = [twt for twt in data if not self.cache.has_key(twt)]
    else:
        uncached = data

    partial = []
    if uncached:
        print 'uncached: ', len(uncached)
        partial = CMUTweetTagger.runtagger_parse(
            uncached, run_tagger_cmd=run_tagger_cmd)
        print 'partial: ', len(partial)
        if enabled_modules['caches'] is not None:
            for twt, tag in zip(uncached, partial):
                self.cache.add_map(twt, tag)

    # Lookup all tags
    if enabled_modules['caches'] is not None:
        tagged = [self.cache.get_map(twt) for twt in data]
    else:
        tagged = partial

    # Store the data in the object
    self._toks = {}
    self._pos = {}
    for twt, tags in zip(data, tagged):
        # Last step of splitting compound words
        newToks, newTags = self.post_process_tokenize(tags)
        self._toks[twt] = newToks
        self._pos[twt] = newTags
def checkTweetNums(tweets, minTweets):
    # number-as-adjective check
    count = 0
    processedtweets = []
    for line in tweets:
        processedtweets.append(" ".join(wordsegment.segment(line)))
    postags = cmu.runtagger_parse(processedtweets)
    for postag in postags:
        # join the tag column into one string, e.g. "$N" = number followed by noun
        postag = "".join(tup[1] for tup in postag)
        # checking for consecutive numbers and nouns
        if "$N" in postag or "$^" in postag or "$M" in postag or "$Z" in postag:
            count += 1
    if count >= minTweets:
        return 1
    else:
        return 0
def process_comments(target_list, post_list):
    hits = []
    for comment in post_list:
        hit_ents = []
        content = [comment['content']]
        ents = CMUTweetTagger.runtagger_parse(content)[0]
        for ent in ents:
            enty = ent[0]
            if enty in target_list:
                hit_ents.append(enty)
        if len(hit_ents) > 0:
            comment['hit_ents'] = hit_ents
            hits.append(comment)
    return hits
def __tag(self, cleanedTweets):
    """Tag the cleaned tweets.

    Arguments:
        cleanedTweets {list} -- a list of cleaned tweets

    Returns:
        list -- [[(form, pos, score), ...], [(form, pos, score), ...], ...]
    """
    print("The number of tweets before tagging: {}".format(len(cleanedTweets)))
    taggedTweets = CMUTweetTagger.runtagger_parse(cleanedTweets)
    print("The number of tweets after tagging: {}".format(len(taggedTweets)))
    self.helper.dumpJson(self.fileFolderPath, "tagged_tweets.json", taggedTweets)
    print("tagged_tweets.json has been saved.")
    return taggedTweets
def produce_entity_list(data):
    output_dict = {}
    for i in data.index:
        tweet = [data.loc[i, 'content']]
        ents = CMUTweetTagger.runtagger_parse(tweet)[0]
        for ent in ents:
            if ent[0] in output_dict.keys():
                output_dict[ent[0]]['count'] += 1
            else:
                # keep only noun/adjective-like tags from the CMU tagset
                if ent[1] in ['N', '^', 'S', 'Z', 'M', 'A']:
                    enty = ent[0]
                    pos = ent[1]
                    output_dict[enty] = {'pos': pos, 'count': 1}
    output_df = pd.DataFrame.from_dict(output_dict, orient='index')
    output_df['entity'] = output_df.index
    output_df.index = range(len(output_df))
    return output_df, output_dict
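# Usage sketch (assumption: `data` is a pandas DataFrame with a 'content'
# column, as the .loc access above implies):
import pandas as pd

df = pd.DataFrame({"content": ["Apple unveiled the iPhone", "just chilling"]})
entity_df, entity_dict = produce_entity_list(df)
print(entity_df[['entity', 'pos', 'count']])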
def get_tense_vectors(sents):
    # Note: VBD/VBN/VBP/VBG/VBZ/MD are Penn Treebank tags, not the one-letter
    # CMU tagset, so this presumably relies on cmu.runtagger_parse defaulting
    # to the PTB-trained model that ships with ark-tweet-nlp.
    tagged_sents = cmu.runtagger_parse(sents)
    feat_vecs = []
    for tagged_sent in tagged_sents:
        feat_vec = []
        # past
        feat_vec.append(
            len([word for word in tagged_sent if word[1] in ['VBD', 'VBN']]))
        # present
        feat_vec.append(
            len([
                word for word in tagged_sent
                if (word[1] in ['VBP', 'VBG', 'VBZ'] or word[0] == 'now')
            ]))
        # future
        feat_vec.append(len([word for word in tagged_sent if word[1] == 'MD']))
        feat_vecs.append(feat_vec)
    return feat_vecs
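# A variant of mine that makes the PTB model explicit rather than relying on
# the module default; the jar and model paths are assumptions, mirroring the
# RUN_TAGGER_CMD_PTB command built in the notebook snippet further down:
PTB_CMD = ("java -XX:ParallelGCThreads=2 -Xmx500m -jar ark-tweet-nlp-0.3.2.jar "
           "--model model.ritter_ptb_alldata_fixed.20130723.txt")

def get_tense_vectors_ptb(sents):
    tagged_sents = cmu.runtagger_parse(sents, run_tagger_cmd=PTB_CMD)
    # e.g. "I walked home" -> [1, 0, 0]; "I will call you" -> [0, 0, 1]
    return [[sum(1 for w in s if w[1] in ('VBD', 'VBN')),
             sum(1 for w in s if w[1] in ('VBP', 'VBG', 'VBZ') or w[0] == 'now'),
             sum(1 for w in s if w[1] == 'MD')]
            for s in tagged_sents]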
def annotate_pos(tweets, ptb=False):
    if ptb:
        tagger_cmd = RUN_TAGGER_CMD_PTB
    else:
        tagger_cmd = RUN_TAGGER_CMD
    ids = []
    texts = []
    for key, value in tweets.items():
        ids.append(key)
        texts.append(json.dumps({'text': value['text']}))
    pos = CMUTweetTagger.runtagger_parse(texts, run_tagger_cmd=tagger_cmd)
    if len(ids) != len(pos):
        raise Exception("Error: Tweet Tagger returned incorrect results")
    for i in range(0, len(ids)):
        tweets[ids[i]]['pos'] = pos[i]
        tweets[ids[i]]['tokens'] = [tag[0] for tag in pos[i]]
    print pos[0]
    print [tag[0] for tag in pos[0]]
def resolve(self, original):
    data = [self.normalizeKey(twt) for twt in set(original)]

    # Tag all uncached data
    uncached = [twt for twt in data if not self.cache.has_key(twt)]

    if uncached:
        print 'uncached: ', len(uncached)
        partial = CMUTweetTagger.runtagger_parse(uncached)
        print 'partial: ', len(partial)
        for twt, tag in zip(uncached, partial):
            self.cache.add_map(twt, tag)

    # Lookup all tags
    tagged = [self.cache.get_map(twt) for twt in data]

    # Store the data in the object
    self._toks = {}
    self._pos = {}
    for twt, tags in zip(data, tagged):
        # Last step of splitting compound words
        newToks, newTags = self.post_process_tokenize(tags)
        self._toks[twt] = newToks
        self._pos[twt] = newTags
def pos_tagger_writer(self):
    tweets = []
    for tw in self.tweet_original:
        try:
            tw = tw.decode('unicode_escape').encode('ascii', 'ignore')
        except UnicodeDecodeError:
            tw = re.sub(r'\\+', '', tw)
            tw = tw.decode('unicode_escape').encode('ascii', 'ignore')
        tweets.append(tw)
    tweet_tags = {}
    sent_tags = CMUTweetTagger.runtagger_parse(tweets)
    i = 0
    for sent in sent_tags:
        is_question = 1 if self.is_question[i] == "yes" else 0
        is_answerable = 1 if self.is_answerable[i] == "yes" else 0
        pos = {"tags": sent,
               "is_question": is_question,
               "is_answerable": is_answerable}
        tweet_tags[self.tweet_id[i]] = pos
        i += 1
    with open(self.tagFile, 'w') as fil_tweet:
        json.dump(tweet_tags, fil_tweet)
import nltk
import CMUTweetTagger

ex1 = "Before what happened at lunch when someone decided to piss me off.I had so much at the diabetes walk. Especially with my girls #TWERKTEAM!"
#text1 = nltk.word_tokenize(ex1)
#print nltk.pos_tag(text1)

ex2 = "my mum got me a tofee apple company.. Diabetes is definately tryna proceed to me LOL"
#text2 = nltk.word_tokenize(ex2)
#print nltk.pos_tag(text2)

print CMUTweetTagger.runtagger_parse([ex2])
def transform(self, X):
    return CMUTweetTagger.runtagger_parse(X)
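# This one-liner reads like the transform() of an sklearn-style transformer;
# a self-contained sketch of such a wrapper (the class name and the fit/
# transform framing are my assumptions, not from the source):
from sklearn.base import BaseEstimator, TransformerMixin
import CMUTweetTagger

class CMUTaggerTransformer(BaseEstimator, TransformerMixin):
    """Stateless step: raw tweets in, [(token, tag, confidence), ...] lists out."""

    def fit(self, X, y=None):
        return self  # nothing to learn

    def transform(self, X):
        return CMUTweetTagger.runtagger_parse(X)

tagged = CMUTaggerTransformer().fit_transform(["example tweet 1", "example tweet 2"])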
tofile = open(argv[3], "w")  # file to take output arff
tofile.close()
idiomsEx = file.readlines()
list_type = file_type.readlines()

sociallists = []  # to take hashtags in a list
for line in idiomsEx:
    sociallists.append(line.replace("\n", ""))

# parse the hashtags using the str2num library and add them as a list
parsedSociallists = []
for line in sociallists:
    parsedSociallists.append(str2num.words2num(" ".join(ws.segment(line))))

postags = cmu.runtagger_parse(parsedSociallists)  # one list of postags per hashtag

i = 0
for ParsedTag, postag, type in zip(parsedSociallists, postags, list_type):
    # checks for the hashtag in the files provided
    checkTweetsret = checkTweets.checkTweets(ParsedTag, "test/" + str(i / 100) + "tweets.txt")
    i += 1
    tofile = open(argv[3], "a")
    tofile.write(str(testFile1.test1(ParsedTag)) + "," +           # number of characters in hashtag
                 str(testFile2.test2(ParsedTag)) + "," +           # number of words in hashtag
                 str(testFile4.test4(ParsedTag)) + "," +           # presence of days
                 str(testFile5.numbercount(postag)) + "," +        # presence of numbers
                 str(testFile5.prepositioncount(postag)) + "," +   # presence of prepositions
def DataPreprocessing(data, train=1):
    global docCount

    # EXTRACTING DENSE FEATURES
    sentiment = np.array([])
    word_count = np.array([])
    char_count = np.array([])
    sent_count = np.array([])
    syl_count = np.array([])
    mention_count = np.array([])
    url_count = np.array([])
    special_count = np.array([])
    cat_count = np.array([])
    dic = Pyphen(lang='en')
    for text in data["tweet"]:
        blob = TextBlob(text)
        # OPTIONAL SPELLING CORRECTION
        #data.loc[docCount, "tweet"] = str(blob.correct())
        url_count = np.append(url_count, blob.words.count("URL"))
        mention_count = np.append(mention_count, blob.words.count("USER"))
        cat_count = np.append(cat_count, sum(c == '#' for c in text))
        special_count = np.append(
            special_count,
            sum(not c.isalnum() and c != ' ' and c != '@' and c != '#' for c in text))
        syl_count = np.append(
            syl_count, len(TextBlob(dic.inserted(text).replace('-', ' ')).words))
        char_count = np.append(char_count, len(text))
        word_count = np.append(word_count, len(blob.words))
        sent_count = np.append(sent_count, len(blob.sentences))
        sentiment = np.append(sentiment, blob.sentiment.polarity)
        docCount += 1

    # INITIALIZING STEMMER AND STOP WORD CORPUS
    stop_words = set(stopwords.words('english'))
    porter_stemmer = PorterStemmer()

    # POS TAGGING
    POS = CMUTweetTagger.runtagger_parse(data["tweet"])
    POSDictionary = {
        "N": "nn", "O": "pro", "S": "np", "^": "nnps", "Z": "nnpz",
        "L": "vl", "M": "nv", "V": "md", "A": "adj", "R": "adv",
        "!": "int", "D": "det", "P": "ppt", "&": "cc", "T": "rp",
        "X": "ex", "Y": "exv", "#": "cat", "@": "tar", "~": "dsc",
        ",": "punc", "$": "num", "U": "url", "E": "emo", "G": "abr"
    }

    # PREPROCESSING (REMOVE STOP WORDS AND STEMMING)
    docCount = 0
    for doc in POS:
        filtered_sentence = []
        for word in doc:
            if word[0] not in stop_words:
                filtered_sentence.append(porter_stemmer.stem(word[0]))  # +'_'+POSDictionary[word[1]]
        data.loc[docCount, "tweet"] = filtered_sentence
        data.loc[docCount, "tweet"] = " ".join(data.loc[docCount, "tweet"])
        docCount += 1

    # REPLACING LABEL (subtask) WITH INTEGER
    if train == 1:
        data['label'] = data['subtask'].factorize()[0]
    data['sentiment'] = sentiment + 1
    data['sent_count'] = sent_count
    data['word_count'] = word_count
    data['syl_count'] = syl_count
    data['url_count'] = url_count
    data['mention_count'] = mention_count
    data['cat_count'] = cat_count
    data['special_count'] = special_count

    # SEPARATING FEATURES AND LABELS
    X = data[[
        'tweet', 'sentiment', 'sent_count', 'word_count', 'syl_count',
        'url_count', 'mention_count', 'special_count', 'cat_count'
    ]]
    if train == 1:
        y = data['label']
    else:
        y = None
    return X, y
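# Usage sketch (assumptions: a module-level docCount counter, as the `global`
# statement requires, and a DataFrame with 'tweet' and 'subtask' columns;
# the file name is a placeholder):
import pandas as pd

docCount = 0
train_df = pd.read_csv("train.tsv", sep="\t")
X_train, y_train = DataPreprocessing(train_df, train=1)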
@Licence : This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/.
'''
import CMUTweetTagger as cmu
import wordsegment as ws

file1 = open()  # file paths omitted in the original
file2 = open()
data1 = file1.read()
data2 = file2.read()
tweets1 = data1.split("\n\n")

# the first line of each tweet block is the hashtag; segment it into words
hashtags = []
for tweet1 in tweets1:
    hashtag = tweet1.split("\n")[0]
    hashtags.append(" ".join(ws.segment(hashtag)))

postags = cmu.runtagger_parse(hashtags)

# count hashtags whose tag string contains a number ($)
i = 0
for postag in postags:
    if '$' in "".join(tup[1] for tup in postag):
        i += 1
file = open(argv[1])  # file containing socialList and nonsocialList hashtags
tofile = open(argv[2], "w")  # file that takes the arff output
tofile.close()
idiomsEx = file.readlines()

sociallists = []
for line in idiomsEx:
    sociallists.append(line.replace("\n", ""))

parsedSociallists = []
for line in sociallists:
    parsedSociallists.append(" ".join(ws.segment(line)))

postags = cmu.runtagger_parse(parsedSociallists)
'''
file output would be in the format of popularity,precision at 10,precision at 20
in each line for every hashtag. This takes a lot of time to run.
'''
for ParsedTag, postag in zip(parsedSociallists, postags):
    tofile = open(argv[2], "a")
    # checks the hashtag in google and returns a list of its popularity and
    # precision at 10 urls and 20 urls
    a = testFile14.test14(ParsedTag, postag)
    print str(a[0]) + "," + str(a[1]) + "," + str(a[2])
    tofile.write(str(a[0]) + "," + str(a[1]) + "," + str(a[2]) + "\n")
    tofile.close()
file = open('../socialList.txt')
idiomsEx = file.readlines()
arr = []
i = 0
idioms = []
strlength = 0
word = 0
for x in idiomsEx:
    a = segment(x.replace("\n", ""))
    strlength += len(x.replace("\n", ""))
    idioms.append(" ".join(a))
    word = word + len(a)

postags = cmu.runtagger_parse(idioms)
count = len(postags)
nouns = 0
pronouns = 0
conjunctions = 0
interjections = 0
verbs = 0
adjectives = 0
adverbs = 0
prepositions = 0
for x in postags:
    # count per-tag occurrences; Counter must see the tag field,
    # not the whole (token, tag, confidence) tuple
    tagscount = Counter(tag for tok, tag, conf in x)
    nouns += tagscount['N'] + tagscount['^']
    pronouns += tagscount['O']
for clfreq in freqTwCl.most_common(50):
    cl = clfreq[0]
    freq = clfreq[1]
    cluster_score[cl] = 0
    if freq >= freq_th:
        #print "\n(cluster, freq):", clfreq
        clidx = (npindL == cl).nonzero()[0].tolist()
        cluster_centroid = X[clidx].sum(axis=0)
        #print "centroid_array:", cluster_centroid
        try:
            # get the words closest to the cluster centre;
            # most_similar returns (word, similarity) pairs
            sim_word_list = model.most_similar(positive=[cluster_centroid], topn=20)
            pos_tokens = CMUTweetTagger.runtagger_parse(
                [term[0].upper() for term in sim_word_list])
            #print "detect entities", pos_tokens
            score = 0
            for l in pos_tokens:
                term = ''
                for gr in range(0, len(l)):
                    term += l[gr][0].lower() + " "
                if "^" in str(l):
                    score += 2.5
                else:
                    score += 1.0
            cluster_score[cl] = score
        except:
print "Xclean.shape:", Xclean.shape #print map_index_after_cleaning #play with scaling of X X = Xclean Xdense = np.matrix(X).astype('float') X_scaled = preprocessing.scale(Xdense) X_normalized = preprocessing.normalize(X_scaled, norm='l2') #transpose X to get features on the rows #Xt = X_scaled.T # #print "Xt.shape:", Xt.shape vocX = vectorizer.get_feature_names() #print "Vocabulary (tweets):", vocX #sys.exit() boost_entity = {} pos_tokens = CMUTweetTagger.runtagger_parse( [term.upper() for term in vocX]) #print "detect entities", pos_tokens for l in pos_tokens: term = '' for gr in range(0, len(l)): term += l[gr][0].lower() + " " if "^" in str(l): boost_entity[term.strip()] = 2.5 else: boost_entity[term.strip()] = 1.0 # print "boost_entity", sorted( ((v,k) for k,v in boost_entity.iteritems()), reverse=True) # boost_term_in_article = {} # for term in vocX: # if term in vocA: # #print "boost term in article:", term, vocA
def create_resource_list(global_need_resource_list, need_text):
    count = 0
    for text in need_text:
        #output_test_file.write(str(count+1) + ": " + text + "\n")
        source_list_3 = []
        urls = re.findall(web_url, text)
        for i in urls:
            if len(i) > len('http://t.co'):  # skip bare t.co stubs
                source_list_3.append(i)
        text2 = tweet_preprocess(text)
        need_cmu_tags = CMUTweetTagger.runtagger_parse([text2])
        text = tweet_preprocess2(text)
        quantity_dict = {}
        final_resource_keys = []
        source_list = []
        loc_list = []
        poss_places = []
        org_person_list = []
        quantity_dict, final_resource_keys, source_list, poss_places, org_person_list = \
            common_nouns.get_resource(text)
        for i in source_list_3:
            source_list.append(i)
        print(text)
        doc = nlp(text)
        loc_list = proper_noun.give_location(need_cmu_tags)
        # organisations/persons should not double as locations
        for i in org_person_list:
            if i in loc_list:
                try:
                    loc_list.remove(i)
                except ValueError:
                    continue
            if i not in source_list:
                source_list.append(i)
        for i in loc_list:
            if i in source_list:
                try:
                    source_list.remove(i)
                except ValueError:
                    continue
        for i in poss_places:
            if i not in loc_list:  # and location.is_inside_Nepal(i) == 1:
                loc_list.append(i)
        for i in org_person_list:
            if i in final_resource_keys:
                try:
                    final_resource_keys.remove(i)
                except ValueError:
                    continue
        count = count + 1
        final_resource_lists = []
        for key in final_resource_keys:
            if key in quantity_dict:
                final_resource_lists.append(key.split(' ')[-1])
                continue
            if key in text:
                final_resource_lists.append(key)
        post_preprocess(text, global_need_resource_list, final_resource_lists,
                        quantity_dict, loc_list, source_list)
def getTweets(query):
    try:
        if not sample:
            tso = TwitterSearchOrder()  # create a TwitterSearchOrder object
            print(query)
            tso.set_keywords([query])  # define all words we would like to look for
            tso.set_language('en')  # we want to see English tweets only
            tso.set_include_entities(False)  # don't give us all that entity information

            # API credentials
            ts = TwitterSearch(
                consumer_key='SwtLcZe9Im6q998K4cJqANs4n',
                consumer_secret='7PMRM3ec7ltINPVl72FXurMn8Qg9HrS1NKwocYJVlTGngEFbEA',
                access_token='51466054-cJUBESD4H9THIQExiKQ1HOGdR0GflXdyeIeL0TfKw',
                access_token_secret='nn3ESWtluVoLSNFexAKcEesF6rEg0lTJ4QaIbFHJACFDr')

        # how many tweets we want: as many as possible, without sacrificing
        # load time too much
        count = 1000
        i = 0
        tweet_list = []
        if sample:
            print("Reading Sample File")
            for line in file.read().split('\n'):
                tweet_list.append(line)
        else:
            print("Searching....")
            for tweet in ts.search_tweets_iterable(tso):
                if i >= count:
                    break  # stop collecting tweets once we have enough
                # kept as a reference, from the original code:
                #print('@%s tweeted: %s' % (tweet['user']['screen_name'], tweet['text']))
                words = tweet['text']
                # strip a leading "RT @user: " prefix if present
                start = re.search("(((RT )?@(\w)*) ?:? )?", words)
                words = words[start.end():]
                tweet_list.append(words)
                i += 1

        # if we have fewer than 1000 tweets, the corpus is too short
        if len(tweet_list) < 1000:
            print("Sorry! Your search did not return enough results, please try another.")
            return
        print("Search complete!")

        print("Tagging...")
        tagged = CMU.runtagger_parse(sent_tokenize("\n".join(tweet_list)))
        print("Tagging complete!")

        print("Analyzing tags...")
        tag_table = Process.create_rules(tagged)
        syl_rules = Process.get_pos_syllables(tagged)
        rhyme_pos_table = SCD.rhyme_to_POS(tagged)
        print("Analysis Complete!")

        print("Generating poetry...")
        result1 = Process.generate_firsttwo(tag_table, syl_rules)
        r1 = result1[1]
        r2 = result1[2]
        firsttwo = result1[0]
        result2 = Process.generate_lasttwo(tag_table, syl_rules, rhyme_pos_table, r1, r2)
        lasttwo = result2

        print("A poem about " + query + ":")
        print()
        print(firsttwo)
        print(lasttwo)
    except TwitterSearchException as e:
        # take care of all those ugly errors if there are some
        print(e)
sys.path.append(os.path.join(CMU_TWEET, "ark_tweet_nlp_python"))
import CMUTweetTagger

inp = sys.argv[1]
DATA = sys.argv[2]

tweets = []
with open(os.path.join(DATA, inp), "r") as f:
    for i, line in enumerate(f):
        fields = line.rstrip("\n").split("\t")
        tweets.append(fields[3].decode("utf-8"))

tweets_parsed = CMUTweetTagger.runtagger_parse(
    tweets,
    run_tagger_cmd="java -XX:ParallelGCThreads=2 -Xmx500m -jar " +
    os.path.join(CMU_TWEET, "ark-tweet-nlp-0.3.2.jar"))

data = []
with io.open(os.path.join(DATA, inp + ".proc"), "w", encoding="utf-8") as w:
    with open(os.path.join(DATA, inp), "r") as f:
        for i, line in enumerate(f):
            fields = line.rstrip("\n").split("\t")
            instance = {}
            instance["tweetid"] = fields[0]
            instance["userid"] = fields[1]
            instance["sentiment"] = fields[2]
            instance["tweet"] = [e[0].decode("utf-8") for e in tweets_parsed[i]]
            instance["pos"] = [e[1] for e in tweets_parsed[i]]
lemmatize_text(cleaned_text[5])

"""
Fun with the CMU tagger
https://github.com/brendano/ark-tweet-nlp
http://www.ark.cs.cmu.edu/TweetNLP/
https://github.com/ianozsvald/ark-tweet-nlp-python
"""
import CMUTweetTagger  # will wrap this with a web-service

for text in cleaned_text[:10]:
    print text
    print CMUTweetTagger.runtagger_parse([text])
    print

#output:
#AT_USER $aapl. apple's iphone has cracked.
#[[('AT_USER', 'P', 0.5752), ('$aapl', '^', 0.7174), ('.', ',', 0.9668), ("apple's", 'Z', 0.6764), ('iphone', '^', 0.7309), ('has', 'V', 0.9833), ('cracked', 'V', 0.5413), ('.', ',', 0.9983)]]
#$aapl is holding well in the bull flag. did you notice the golden cross on the daily? ;) URL
#[[('$aapl', '^', 0.8645), ('is', 'V', 0.9961), ('holding', 'V', 0.9728), ('well', 'R', 0.8528), ('in', 'P', 0.9986), ('the', 'D', 0.9991), ('bull', 'N', 0.9745), ('flag', 'N', 0.9849), ('.', ',', 0.9979), ('did', 'V', 0.9994), ('you', 'O', 0.9957), ('notice', 'V', 0.9922), ('the', 'D', 0.999), ('golden', 'A', 0.4243), ('cross', 'N', 0.9899), ('on', 'P', 0.9987), ('the', 'D', 0.9991), ('daily', 'A', 0.5749), ('?', ',', 0.9897), (';)', 'E', 0.9774), ('URL', 'N', 0.4083)]]

"""
Interesting projects:
def build_graph(self, tweet):
    # input: a single tweet as a str, e.g. "this is a sample input"
    exp_context_list_dict, meds_disease_dict, phrase_synonym_dict = self.get_ctxt_exp_list()
    cluster_dict = self.get_cluster_data()

    # Getting NP/NE
    cleaned_tweet = self.clean_tweets(tweet)
    entity_results = CMUTweetTagger.runtagger_parse(
        [cleaned_tweet])  # the input should be a list of texts
    print("--- End Tagging Tweets ---")
    print("Tagged Ents: ", len(entity_results))

    # For each tweet
    for i in tqdm(range(len(entity_results))):
        phrases_list = set(self.new_ne_extraction(entity_results[i]))
        type_list = []
        b_syn_list = []
        topic_list = []
        disease_list = []
        topic_links = []
        if len(phrases_list) > 0:
            for ent in phrases_list:
                ent = (ent.replace("#", "")).lower().strip()
                ent = ''.join(e for e in ent if e.isalnum())

                # Types
                type_word = None
                if ent in exp_context_list_dict.keys():
                    type_word = exp_context_list_dict.get(ent)
                    type_list.append(type_word)
                else:
                    type_list.append("")

                # B-syn list
                if ent in phrase_synonym_dict.keys():
                    synonym = phrase_synonym_dict.get(ent)
                    b_syn_list.append(synonym)
                else:
                    b_syn_list.append("")

                # Topic cluster
                if ent.lower().strip() in cluster_dict.keys():
                    topic_list.append(cluster_dict.get(ent.lower().strip()))
                else:
                    topic_list.append("")

                # Diseases linked to a medication
                if type_word == "medication":
                    if ent in meds_disease_dict.keys():
                        disease_list.append(meds_disease_dict.get(ent))
                    else:
                        disease_list.append("")
                else:
                    disease_list.append("")

            topics = list(set(topic_list))
            if "" in topics:
                topics.remove("")
            for (phrase, typ, b_syn, topic, m) in zip(phrases_list, type_list,
                                                      b_syn_list, topic_list,
                                                      disease_list):
                if typ != "" or topic != "":
                    topic_links.append((topic, phrase, b_syn, m, typ))
            if topic_links == []:
                continue

            tweet_dict = {}
            tweet_dict["topics"] = topics
            tweet_dict["topic_links"] = topic_links
            tweet_dict["tweet"] = tweet
            return tweet_dict
def corpus_maker(tweets, positive_classification_type, corpus_filename,
                 negative_classification_type=""):
    y = []
    corpus = []
    corpus_dict = {}
    count = 0
    miss_count = 0
    neg_count = 0
    labeled_tweet_ids = set()
    tweets_list = []
    tweet_ids = []  # kept in step with tweets_list so ids survive the tagging pass
    for row in tweets:
        if len(row) == 5:
            tweet_id = row[1]
            tweet_text = row[2].strip().replace("\n", "")
            classification = row[4].strip()
            if classification == positive_classification_type:
                label = 1
                tweets_list.append(tweet_text)
                tweet_ids.append(tweet_id)
                y.append(label)
            else:
                label = -1
                tweets_list.append(tweet_text)
                tweet_ids.append(tweet_id)
                y.append(label)
                neg_count += 1
            labeled_tweet_ids.add(tweet_id)

    tagged_tweets = CMUTweetTagger.runtagger_parse(tweets_list,
                                                   run_tagger_cmd=tweet_stack_command)
    print "len tweets list " + str(len(tweets_list))
    print "len tagged tweets " + str(len(tagged_tweets))

    for tweet_id, tagged_tweet in zip(tweet_ids, tagged_tweets):
        temp_list = []  # filtered words of a tweet
        for word in tagged_tweet:
            if word[1] != "U":  # drop URL tokens
                temp_list.append(word[0].lower())
        filtered_tweet = ' '.join(temp_list)
        corpus.append(filtered_tweet)
        corpus_dict[filtered_tweet] = tweet_id
        count += 1

    vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english',
                                        charset_error='ignore')
    counts = vectorizer.fit_transform(corpus)  # tf-idf counts
    print vectorizer.get_feature_names()
    x = counts
    serialize_object((y, x, vectorizer, labeled_tweet_ids), corpus_filename)
    return (y, x, vectorizer, labeled_tweet_ids)
tofile = open(argv[3], "w")  # file to take output arff
tofile.close()
idiomsEx = file.readlines()
list_type = file_type.readlines()

sociallists = []  # to take hashtags in a list
for line in idiomsEx:
    sociallists.append(line.replace("\n", ""))

# parse the hashtags using the str2num library and add them as a list
parsedSociallists = []
for line in sociallists:
    parsedSociallists.append(str2num.words2num(" ".join(ws.segment(line))))

postags = cmu.runtagger_parse(parsedSociallists)  # one list of postags per hashtag

i = 0
for ParsedTag, postag, type in zip(parsedSociallists, postags, list_type):
    # checks for the hashtag (with spaces removed) in the files provided
    checkTweetsret = checkTweets.checkTweets(ParsedTag.replace(" ", ""),
                                             "test/" + str(i / 100) + "tweets.txt")
    i += 1
    tofile = open(argv[3], "a")
    tofile.write(str(testFile1.test1(ParsedTag)) + "," +           # number of characters in hashtag
                 str(testFile2.test2(ParsedTag)) + "," +           # number of words in hashtag
                 str(testFile4.test4(ParsedTag)) + "," +           # presence of days
                 str(testFile5.numbercount(postag)) + "," +        # presence of numbers
                 str(testFile5.prepositioncount(postag)) + "," +   # presence of prepositions
import operator
import my_feature_ex as fx
import my_word_cloud as wcloud

# <codecell>

import nltk
import json
sys.path.append('/Users/doug/SW_Dev/ark-tweet-nlp-0.3.2')
import CMUTweetTagger
#print CMUTweetTagger.runtagger_parse(['example tweet 1', '@foo example tweet 2'])

RUN_TAGGER_CMD = "java -XX:ParallelGCThreads=2 -Xmx500m -jar /Users/doug/SW_Dev/ark-tweet-nlp-0.3.2/ark-tweet-nlp-0.3.2.jar"
RUN_TAGGER_CMD_PTB = "java -XX:ParallelGCThreads=2 -Xmx500m -jar /Users/doug/SW_Dev/ark-tweet-nlp-0.3.2/ark-tweet-nlp-0.3.2.jar --model /Users/doug/SW_Dev/ark-tweet-nlp-0.3.2/model.ritter_ptb_alldata_fixed.20130723.txt"

print CMUTweetTagger.runtagger_parse(['example tweet 1', 'example tweet 2'],
                                     run_tagger_cmd=RUN_TAGGER_CMD)
print CMUTweetTagger.runtagger_parse(['example tweet 1', 'example tweet 2'],
                                     run_tagger_cmd=RUN_TAGGER_CMD_PTB)

def annotate_pos(tweets, ptb=False):
    if ptb:
        tagger_cmd = RUN_TAGGER_CMD_PTB
    else:
        tagger_cmd = RUN_TAGGER_CMD
    ids = []
    texts = []
    for key, value in tweets.items():
        ids.append(key)
        texts.append(json.dumps({'text': value['text']}))
    pos = CMUTweetTagger.runtagger_parse(texts, run_tagger_cmd=tagger_cmd)
    if len(ids) != len(pos):
print "Xclean.shape:", Xclean.shape #print map_index_after_cleaning #play with scaling of X X = Xclean Xdense = np.matrix(X).astype('float') X_scaled = preprocessing.scale(Xdense) X_normalized = preprocessing.normalize(X_scaled, norm='l2') #transpose X to get features on the rows #Xt = X_scaled.T # #print "Xt.shape:", Xt.shape vocX = vectorizer.get_feature_names() #print "Vocabulary (tweets):", vocX #sys.exit() boost_entity = {} pos_tokens = CMUTweetTagger.runtagger_parse([term.upper() for term in vocX]) #print "detect entities", pos_tokens for l in pos_tokens: term ='' for gr in range(0, len(l)): term += l[gr][0].lower() + " " if "^" in str(l): boost_entity[term.strip()] = 2.5 else: boost_entity[term.strip()] = 1.0 # print "boost_entity", sorted( ((v,k) for k,v in boost_entity.iteritems()), reverse=True) # boost_term_in_article = {} # for term in vocX: # if term in vocA: # #print "boost term in article:", term, vocA