def format_generated_body_text_as_html(article_text, image_url=None):
    """
    Given the text of the news story, format it in html so it looks more
    realistic - add paragraph breaks, turn urls into links, etc.
    """
    # Add html links to twitter @ handles, hashtags and regular urls
    p = ttp.Parser()
    result = p.parse(article_text)
    article_text = result.html
    # Split the generated body into lines
    lines = article_text.split("\n")
    # Bold any short lines that look like section titles
    new_lines = []
    for line in lines:
        if len(line) < 80 and "." not in line:
            line = f"<b>{line}</b>"
        new_lines.append(line)
    # Add paragraph tags between lines
    article_text = "<p>".join(new_lines)
    # If we have an image for the story, put it at the top.
    if image_url is not None:
        article_text = f"<img src='{image_url}'><p>{article_text}"
    return article_text
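# A minimal usage sketch (not from the original source); assumes
# twitter-text-python is importable as ttp. "Breaking news" has no period
# and is under 80 chars, so it gets bolded; the URL, @mention and #hashtag
# become <a> links via ttp; the image leads the output.
body = "Breaking news\nDetails at http://example.com via @reporter #scoop"
html = format_generated_body_text_as_html(
    body, image_url="http://example.com/pic.jpg")
print(html)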
def link_urls(text):
    """ Enriches urls in the comment text with an anchor. """
    parser = ttp.Parser(max_url_length=40)
    parser._urls = []
    return ttp.URL_REGEX.sub(parser._parse_urls, text)
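# Hedged sketch of the helper above. ttp.URL_REGEX and Parser._parse_urls
# are internals of twitter-text-python, so this leans on implementation
# details rather than public API; anchor text longer than max_url_length
# gets truncated in the rendered link.
print(link_urls("see http://example.com/a/very/long/path/that/keeps/going"))
# -> roughly: 'see <a href="http://example.com/...">http://example.com/a/...</a>'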
def tweet_meta_features(inputs):
    corpus = numpy.array([tweet.processed_text for tweet in inputs])
    p = ttp.Parser()
    feature_rows = []
    for tweet in corpus:
        result = p.parse(tweet)
        # Boolean and count features derived from the parsed entities.
        does_tweet_contain_link = (len(result.urls) > 0)
        num_of_links_in_tweet = len(result.urls)
        is_twitter_in_links = any(
            url.startswith('https://twitter.com/') for url in result.urls)
        does_tweet_contain_userref = (len(result.users) > 0)
        does_tweet_contain_hashtag = (len(result.tags) > 0)
        num_of_hashtags_in_tweet = len(result.tags)
        happy_emoji_in_tweet = any(
            emoticon in tweet
            for emoticon in ('XD', ':)', '(:', '=|', '8D', ':P', ';D'))
        features = [
            does_tweet_contain_link, num_of_links_in_tweet,
            is_twitter_in_links, does_tweet_contain_userref,
            does_tweet_contain_hashtag, num_of_hashtags_in_tweet,
            happy_emoji_in_tweet
        ]
        feature_rows.append(features)
    return numpy.array(feature_rows)
def parseText(text):
    p = ttp.Parser()
    ttp_result = p.parse(text)
    text = extractUsernamesHashtagsURLS(ttp_result, text)
    text = normalizeTextForTagger(text)
    text = removeEmails(text)
    text = removeLineBreaks(text)
    # Loop over each character finding strange characters
    emojis = {}
    i = 1
    for character in text:
        if ord(character) > 128:
            # emojis[position] = codepoint -> {position: codepoint} pairs
            emojis[i] = str(ord(character))
        i += 1
    temp_text = list(text)
    for emoji in emojis:
        index = int(emoji) - 1
        if emojis[emoji] in sentiments["sentiments"]["emojis"]:
            temp_text[index] = str(
                sentiments["sentiments"]["emojis"][emojis[emoji]]) + " "
        else:
            temp_text[index] = ""
    text = "".join(temp_text)
    return text  # Return parsed/cleaned tweet
def render_html(self):
    if self.feed.type == 'TW':
        p = ttp.Parser()
        tt = p.parse(self.text)
        return tt.html
    else:
        return self.text
def req_twitter(search_items):
    # define twitter_authorizations
    _consumer_key = "axUO19ALyf4XGAjm12BgAwFFx"
    _consumer_secret = "4c4s1OEUBYUNnpcKkAqyhceoEkDn5Et7gfpTo3eBU2sAXCA3EY"
    _key = "219434782-bSCQK2hQD3KAjbWoH8IPIoWvTwXqHZZRnTcfamJk"
    _secret = "qKBldO1TyLeRteUNbsE1m4KKt04MHux8KipripqYsLh4m"
    _auth = OAuth1(_consumer_key, _consumer_secret, _key, _secret)
    p = ttp.Parser()
    # call twitter
    url = 'https://api.twitter.com/1.1/search/tweets.json'
    payload = {
        'q': search_items,
        'lang': 'en',
        'result_type': 'mixed',
        'count': '100'
        # 'until': Get_Time()['today']
    }
    r = requests.get(url=url, auth=_auth, params=payload)
    result = r.json()
    # Format the tweet
    for row in result['statuses']:
        tw_result = p.parse(row['text'])
        row['text_html'] = tw_result.html
        # Format the creation date
        tweet_date = format_date(row['created_at'])
        row['date_text'] = tweet_date
    return result
def save(self, *args, **kwargs):
    if not self.pk:
        super(Image, self).save(*args, **kwargs)
    p = ttp.Parser()
    result = p.parse(self.caption)
    for tag in result.tags:
        self.tags.add(tag)
    super(Image, self).save(*args, **kwargs)
def hashtags(self):
    p = ttp.Parser()
    result = p.parse(self.text)
    return ["#" + tag for tag in result.tags]
def save(self, *args, **kwargs):
    if not self.pk:
        super(Feed, self).save(*args, **kwargs)
    p = ttp.Parser()
    result = p.parse(self.post)
    for tag in result.tags:
        self.tags.add(tag)
    super(Feed, self).save(*args, **kwargs)
def process(self, tup):
    original_tweet = tup.values[3]

    ## Display Tweet ##
    p = ttp.Parser()
    result = p.parse(original_tweet)
    display_tweet = result.html

    ## Common pre-processing ##
    preprocess_tweet = original_tweet
    # Remove RT
    preprocess_tweet = re.sub(r"RT @", "@", preprocess_tweet)
    # Remove URL
    preprocess_tweet = re.sub(r"http\S+", "", preprocess_tweet)
    # Remove Hashtags
    preprocess_tweet = re.sub(r"#\S+", "", preprocess_tweet)
    # Remove Mentions
    preprocess_tweet = re.sub(r"@\S+", "", preprocess_tweet)
    # Remove whitespaces
    preprocess_tweet = re.sub(r"\s\s+", " ", preprocess_tweet).strip()

    ## NLTK pre-processing ##
    nltk_text = preprocess_tweet
    # Remove unicode characters (including emojis)
    nltk_text = nltk_text.encode('ascii', 'ignore').decode('ascii')
    # Remove punctuations and digits
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    nltk_text = regex.sub('', nltk_text)
    regex = re.compile('[%s]' % re.escape(string.digits))
    nltk_text = regex.sub('', nltk_text)
    # Lowercase
    nltk_text = nltk_text.lower()
    # Tokenization
    words_token = word_tokenize(nltk_text)
    # Remove english stopwords
    words_nostop = [w for w in words_token
                    if w not in stopwords.words('english')]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words_lemma = ' '.join([lemmatizer.lemmatize(w) for w in words_nostop])
    nltk_text = words_lemma

    ## StanfordNLP pre-processing ##
    # Convert emojis to text
    snlp_text = re.sub(r"[_:]", " ", preprocess_tweet)
    snlp_text = emoji.demojize(snlp_text)
    snlp_text = re.sub(r"_", " ", snlp_text)
    snlp_text = re.sub(r":", " \" ", snlp_text)
    snlp_text = re.sub(r"\s\s+", " ", snlp_text).strip()

    ## Vader pre-processing ##
    vader_text = preprocess_tweet

    output = [tup.values[0], tup.values[1], tup.values[2], tup.values[3],
              tup.values[4], tup.values[5], tup.values[6], tup.values[7],
              tup.values[8], tup.values[9], display_tweet, snlp_text,
              vader_text, nltk_text]
    storm.emit(output)
def _format_instagram_message(instagram, full=True):
    # print instagram
    message_id = 'instagram_%s' % (instagram.id)
    message_date = instagram.created_time
    # datetime.datetime.fromtimestamp(int(instagram.created_time))
    if not full:
        return {'message_id': message_id, 'message_date': message_date}
    """
    Sample media object, for reference:
    {'caption': Comment: inertiaunlimited said "What's better than going down
        a water slide in the Bahamas? Capturing the fun in high-speed, of
        course. #atlantisresort #xmo #sportstech",
     'comment_count': 1,
     'comments': [Comment: atlantisresort said "Amazing!"],
     'created_time': datetime.datetime(2015, 12, 7, 13, 33, 4),
     'filter': 'Normal',
     'id': '1134770313898096730_2142750893',
     'images': {'low_resolution': Image: https://scontent.cdninstagram.com/hphotos-xpf1/t51.2885-15/s320x320/e35/12277430_1682421975377994_535438069_n.jpg,
                'standard_resolution': Image: https://scontent.cdninstagram.com/hphotos-xpf1/t51.2885-15/s640x640/sh0.08/e35/12277430_1682421975377994_535438069_n.jpg,
                'thumbnail': Image: https://scontent.cdninstagram.com/hphotos-xpf1/t51.2885-15/s150x150/e35/12277430_1682421975377994_535438069_n.jpg},
     'like_count': 1,
     'likes': [User: itsofficialjojo],
     'link': 'https://www.instagram.com/p/-_g5aVjfha/',
     'location': Location: 145849377 (Point: (25.084288198, -77.321467938)),
     'tags': [Tag: xmo, Tag: sportstech, Tag: atlantisresort],
     'type': 'image',
     'user': User: inertiaunlimited,
     'users_in_photo': []}
    """
    parser = ttp.Parser()
    try:
        caption = _cleanhtml(instagram.caption.text)
    except AttributeError:
        caption = ''
    parsed = parser.parse(caption)
    message_url = instagram.link
    instagram_url = (instagram.get_standard_resolution_url()).replace(
        "http://", 'https://')
    text = ("<figure><a href='%s'><img src='%s' alt='%s'></a>"
            "<figcaption>%s</figcaption></figure>"
            % (message_url, instagram_url, caption,
               _process_message_html(parsed.html)))
    message = {
        'id': instagram.id,
        'message_id': message_id,
        'message_date': message_date,
        'message_timesince': _timesince(message_date),
        'user_name': instagram.user.full_name,
        'user_screen_name': instagram.user.username,
        'user_avatar_url': instagram.user.profile_picture,
        'user_profile_url': 'https://instagram.com/%s' % instagram.user.username,
        'message_url': message_url,
        'message_html': text,
        'hashes': [hashtag.name.lower() for hashtag in instagram.tags],
        'raw_data': instagram
    }
    return message
def __init__(self, path):
    self.idxpath = path
    self.ix = open_dir(self.idxpath)
    self.query = MultifieldParser(['content', 'ctime'],
                                  schema=self.ix.schema)
    self.query.add_plugin(DateParserPlugin())
    self.sorter = MultiFacet(["ctime", ScoreFacet()])
    self.parser = ttp.Parser()
    self.dateparser = parser.parser()
def clean(text):
    p = ttp.Parser(include_spans=True)
    r = p.parse(text)
    result = text
    for user in r.users:
        init = user[1][0]
        end = user[1][1]
        # Blank the mention in place by slicing, which keeps the remaining
        # span offsets valid; str.replace() would also clobber identical
        # text elsewhere in the tweet.
        result = result[:init] + ' ' * (end - init) + result[end:]
    print(result)
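# A sketch (not from the original source) of what Parser(include_spans=True)
# hands back: each entity comes as a (text, (start, end)) tuple, which is
# what clean() above relies on to blank mentions without shifting offsets.
p = ttp.Parser(include_spans=True)
r = p.parse("@alice hello @bob")
# r.users is roughly [('alice', (0, 6)), ('bob', (13, 17))]; the spans
# cover the full "@name" match in the original string.
clean("@alice hello @bob")  # prints the text with both mentions blanked out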
def parse(text):
    """
    Uses the ttp package to parse user handles, hashtags, urls and html
    out of a tweet's text.
    """
    p = ttp.Parser()
    result = p.parse(text)
    users = result.users
    tags = result.tags
    urls = result.urls
    html = result.html
    return [users, tags, urls, html]
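# A small usage sketch (not from the original source) of the parse() wrapper:
users, tags, urls, html = parse("@jack try #ttp at http://example.com")
# users -> ['jack'], tags -> ['ttp'], urls -> ['http://example.com'],
# html  -> the same text with each entity wrapped in an <a> link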
def update_tags(self):
    # Remove any existing tags for this post
    PostTag.objects.filter(post=self).delete()
    # Parse post summary for tags
    parser = ttp.Parser()
    result = parser.parse(self.summary)
    # Save new tags
    for tag_text in result.tags:
        tag, created = Tag.objects.get_or_create(text=tag_text)
        print(tag)
        PostTag.objects.get_or_create(post=self, tag=tag)
def format_tweet_text(tweet_text):
    """Convert tweet text to html: add mentions, hashtags, emoji, links."""
    # Emoji support
    tweet_processed_text = format_tweet_emojis(tweet_text)
    # Mentions, hashtags, links
    tweet_processed_text = ttp.Parser().parse(tweet_processed_text).html
    # Links customization
    tweet_processed_text = tweet_processed_text.replace(
        "<a ", '<a target="_blank" rel="noreferrer" ')
    return tweet_processed_text
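# A hypothetical call of the formatter above; format_tweet_emojis() is
# assumed to be defined elsewhere in the same module.
html = format_tweet_text("hello @world #greetings http://example.com")
# Every anchor ttp emits starts with "<a ", so each link now carries
# target="_blank" and rel="noreferrer".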
def __init__(self):
    self.gender_name = GenderName()
    self.gender_slots = GenderSlots()
    self.db = pymongo.MongoClient().twitts
    self.dbTweets = pymongo.MongoClient().tweets
    self.twitter_parser = ttp.Parser()
    self.uniques = set()
    self.filter_regex = re.compile(
        r'.*(follow|siguen de vuelta en Twitter|una nueva foto en Facebook|'
        r'He publicado|Acabo de publicar una foto|m at).*',
        re.UNICODE)
    self.html_parser = HTMLParser.HTMLParser()
def _getTermCounter(self, method, data):
    p = ttp.Parser()
    count_all = Counter()
    results = []
    for item in data:
        tw_content = item['root_tweet']
        if tw_content['lang'] is not None and 'en' in tw_content['lang']:
            r = p.parse(tw_content['text'])
            count_all.update(getattr(r, method))
            results.append(sum(count_all.values()))
        else:
            results.append(0)
    return results
def on_data(self, data):
    global unique_links
    # get the text from the tweet
    json_dict = json.loads(data)
    if 'text' not in json_dict:
        return True
    text = unicodedata.normalize('NFKD', json_dict['text']).encode(
        'ascii', 'ignore').decode('ascii')
    parsed = ttp.Parser().parse(text)
    urls = parsed.urls
    # check if urls is empty and return immediately
    if not urls:
        return True
    # follow_shortlinks returns a dictionary mapping each short_url to the
    # list of urls in its redirect chain
    long_urls = utils.follow_shortlinks(urls)
    for url in urls:
        try:
            links = long_urls[url]
            if links is not None:
                # take the last url in the redirect chain
                link = links[-1]
                # the link is real - now resolve its final address
                resp = urllib.request.urlopen(link)
                geturl = resp.geturl()
                parsedurl = urlparse(geturl)
                domain = '{uri.scheme}://{uri.netloc}/'.format(
                    uri=parsedurl)
                if resp.getcode() == 200:
                    # add the long link to our unique links,
                    # or stop collecting once we have 100
                    if len(unique_links) < 100:
                        if link not in unique_links:
                            if (domain != 'https://twitter.com/'
                                    and domain != 'https://t.co/'):
                                with open('url.txt', 'a') as f:
                                    f.write(link + '\n')
                                print(link)
                                unique_links.add(link)
                        else:
                            return True
                        return True
                else:
                    return False
        except IncompleteRead:
            pass
def _format_instagram_message_dict(instagram, full=True):
    # print instagram
    message_id = 'instagram_%s' % (instagram['id'])
    message_date = datetime.datetime.fromtimestamp(
        int(instagram['created_time']))
    if not full:
        return {'message_id': message_id, 'message_date': message_date}
    # print instagram
    parser = ttp.Parser()
    try:
        caption = _cleanhtml(instagram['caption']['text'])
    except (KeyError, TypeError):
        caption = ''
    parsed = parser.parse(caption)
    message_url = instagram['link']
    instagram_url = (
        instagram['images']['standard_resolution']['url']).replace(
            "http://", 'https://')
    text = ("<figure><a href='%s'><img src='%s' alt='%s'></a>"
            "<figcaption>%s</figcaption></figure>"
            % (message_url, instagram_url, caption,
               _process_message_html(parsed.html)))
    message = {
        'message_id': message_id,
        'message_date': message_date,
        'message_timesince': _timesince(message_date),
        'user_name': instagram['user']['full_name'],
        'user_screen_name': instagram['user']['username'],
        'user_avatar_url': instagram['user']['profile_picture'],
        'user_profile_url': 'https://instagram.com/%s' % instagram['user']['username'],
        'message_url': message_url,
        'message_html': text,
        'hashes': [hashtag.lower() for hashtag in instagram['tags']],
        'raw_data': instagram
    }
    return message
def _format_twitter_message(tweet, full=True):
    # print tweet
    message_id = 'tweet_%s' % (tweet['id'])
    message_date = datetime.datetime.strptime(tweet['created_at'],
                                              '%a %b %d %H:%M:%S +0000 %Y')
    if not full:
        return {'message_id': message_id, 'message_date': message_date}
    parser = ttp.Parser()
    parsed = parser.parse(tweet['text'])
    message_url = 'https://twitter.com/%s/status/%s' % (
        tweet['user']['screen_name'], tweet['id'])
    if 'media' in tweet['entities']:
        text = ("<figure><a href='%s'><img src='%s:large' alt='%s'></a>"
                "<figcaption>%s</figcaption></figure>"
                % (message_url,
                   tweet['entities']['media'][0]['media_url_https'],
                   _cleanhtml(parsed.html),
                   _process_message_html(parsed.html)))
    else:
        text = _process_message_html(parsed.html)
    return {
        'id': tweet['id'],
        'message_id': message_id,
        'message_date': message_date,
        'message_timesince': _timesince(message_date),
        'user_name': tweet['user']['screen_name'],
        'user_screen_name': tweet['user']['screen_name'],
        'user_avatar_url': tweet['user']['profile_image_url_https'],
        'user_profile_url': 'https://twitter.com/%s' % tweet['user']['screen_name'],
        'message_url': message_url,
        'message_html': text,
        'hashes': [hashtag['text'].lower()
                   for hashtag in tweet['entities']['hashtags']],
        'raw_data': tweet
    }
def create_tweet(title, link):
    # A tweet is 140 chars; reserve room for the trailing space and the
    # t.co-wrapped link.
    max_title_len = 139 - int(app.config['TWITTER_LINK_LEN'])
    p = ttp.Parser(include_spans=True)
    result = p.parse(title)
    urlcount = len(result.urls)
    if urlcount > 0:
        # URLs inside the title get t.co-wrapped too: count each as
        # TWITTER_LINK_LEN instead of its literal length.
        urlchars = 0
        for url in result.urls:
            urlchars = urlchars + (url[1][1] - url[1][0])
        max_title_len = max_title_len \
            - (urlcount * int(app.config['TWITTER_LINK_LEN'])) \
            + urlchars
    title_len = len(title)
    if title_len > max_title_len:
        title = title[:max_title_len - 1] + u"\u2026"
    return title + ' ' + link
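# Worked example of the length budget above (not from the original source),
# assuming app.config['TWITTER_LINK_LEN'] == 23, i.e. t.co wraps every URL
# to a fixed length:
#   - a bare title gets 139 - 23 = 116 characters;
#   - a 30-character URL inside the title frees its 30 literal characters
#     but costs 23 wrapped ones, so the budget becomes 116 - 23 + 30 = 123.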
def htmlify_description(json_data):
    """Passed the raw JSON data about a User from Twitter's API, it returns
    an HTMLified version of the User's description.

    * Replaces t.co URLs with clickable, full links.
    * Makes #hashtags into clickable links.
    * Makes @usernames into clickable links.

    Different to htmlify_tweet() because:

    * Twitter user data only includes entities for urls, not hashtags etc.
      https://twittercommunity.com/t/why-do-user-entities-have-only-urls-field-and-not-others/59181
    * So we manually make the t.co links into their full, clickable version.
    * And then use twitter-text-python to linkify everything else.
    """
    # I don't think users in the Twitter archive JSON have description
    # elements:
    try:
        desc = json_data['description']
    except KeyError:
        return ''

    # Make t.co URLs into their original URLs, clickable.
    if 'entities' in json_data and 'description' in json_data['entities']:
        entities = json_data['entities']['description']
        if 'urls' in entities:
            for entity in entities['urls']:
                start, end = entity['indices'][0], entity['indices'][1]
                shown_url = entity['display_url']
                link_url = entity['expanded_url']
                url_html = '<a href="%s" rel="external">%s</a>'
                desc = desc.replace(json_data['description'][start:end],
                                    url_html % (link_url, shown_url))

    # Make #hashtags and @usernames clickable.
    parser = ttp.Parser()
    parsed = parser.parse(desc)

    return parsed.html
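# Minimal sketch of htmlify_description() on a hand-built user dict; the
# field layout mirrors Twitter's API, but the values here are invented.
user_json = {
    'description': 'Writing at https://t.co/abc123 #blog',
    'entities': {'description': {'urls': [{
        'indices': [11, 30],
        'display_url': 'example.com',
        'expanded_url': 'http://example.com',
    }]}},
}
print(htmlify_description(user_json))
# The t.co URL is swapped for a full anchor to http://example.com, and ttp
# then linkifies #blog.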
def vidDownload(saveThis, tweet):
    p = ttp.Parser()
    try:
        r = p.parse(tweet.decode('utf-8'))
        # print(r.urls)
        for link in r.urls:
            # print link
            resp = urllib.request.urlopen(link)
            print(resp.url)
            if "https://www.periscope.tv/w/" in resp.url:
                hash_url = abs(hash(resp.url))
                with open('twitDB2.txt', 'a') as saveFile:
                    saveFile.write(str(hash_url))
                    saveFile.write('\n')
                    saveFile.write(saveThis)
                    saveFile.write('\n')
                process(hash_url, resp.url)
    except Exception as e:
        print(str(e))
def make_tweets(find_id, original):
    # fix this error
    found = original[original['in_response_to_id'] == find_id]
    p = ttp.Parser()
    parsed = p.parse(found.iloc[0]['Tweet'])
    current_tweet = {}
    current_tweet['ID'] = found.iloc[0].in_response_to_id
    current_tweet['Tweet'] = found.iloc[0].Tweet
    current_tweet['Time'] = (found.iloc[0].Time
                             - (found.iloc[1].Time - found.iloc[0].Time))
    try:
        current_tweet['User'] = parsed.users[0]
    except IndexError:
        current_tweet['User'] = ''
    current_tweet['Likes'] = 0
    current_tweet['Retweets'] = 0
    current_tweet['in_response_to_id'] = 0
    current_tweet['response_type'] = 'tweet'
    return pd.Series(current_tweet)
def get_tweets(request):
    datas = []
    p = ttp.Parser()
    try:
        api = twitter.Api(
            consumer_key=settings.TWITTER_CONSUMER_KEY,
            consumer_secret=settings.TWITTER_CONSUMER_SECRET,
            access_token_key=settings.TWITTER_ACCESS_TOKEN,
            access_token_secret=settings.TWITTER_ACCESS_TOKEN_SECRET)
        tweets = api.GetUserTimeline(screen_name='kodlaco')
        for tweet in tweets:
            datas.append({
                # 'text': p.parse(tweet.text).html,
                'text': tweet.text,
                'id_str': tweet.id_str
            })
    except Exception:
        datas = []
    return HttpResponse(json.dumps(datas), content_type="application/json")
def createAdjacencyList(inputFile):
    p = ttp.Parser()
    # create an adjacency list
    adjacencyList = {}
    with open(inputFile, 'r') as twitterFile:
        for line in twitterFile:
            line = line.rstrip('\n')
            t = line.split("\t")
            timestamp = int(
                datetime.strptime(t[0], "%Y-%m-%d %H:%M:%S").strftime("%s"))
            username = t[1]
            mentionedUsers = p.parse(t[2], html=False).users
            if len(mentionedUsers) > 0:
                if username not in adjacencyList:
                    adjacencyList[username] = {}
                for m in mentionedUsers:
                    if m not in adjacencyList[username]:
                        adjacencyList[username][m] = {}
                        adjacencyList[username][m]['timestamps'] = [timestamp]
                    else:
                        adjacencyList[username][m]['timestamps'] += [timestamp]
                    adjacencyList[username][m]['timestamps'].sort()
                    adjacencyList[username][m]['firstMention'] = \
                        adjacencyList[username][m]['timestamps'][0]
                    adjacencyList[username][m]['numberOfMentions'] = len(
                        adjacencyList[username][m]['timestamps'])
    return adjacencyList
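# Shape of the structure the function above builds, for an input line like
# "2016-01-01 10:00:00\talice\t@bob hello" (invented example data; the
# epoch value depends on the local timezone, since strftime("%s") does):
# {'alice': {'bob': {'timestamps': [1451642400],
#                    'firstMention': 1451642400,
#                    'numberOfMentions': 1}}}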
def clean(list_of_tweets_texts):
    p = ttp.Parser(include_spans=True)
    dataset = list_of_tweets_texts
    tweets = []
    authors = []
    dates = []
    for mystring, username, date in dataset:
        try:
            mystring.encode('utf8', 'ignore')
            sentence = mystring
            result = p.parse(mystring)
            try:
                # Strip the first URL (located via its span) from the text.
                start = result.urls[0][1][0]
                end = result.urls[0][1][1]
                sentence = mystring[:start] + mystring[end:]
            except IndexError:
                pass
            tweets.append(sentence)
            authors.append(username)
            dates.append(date)
        except (IndexError, UnicodeDecodeError):
            pass
    return tweets, authors, dates
def create_adjacency_list(file):
    """ Twitter Parser """
    p = ttp.Parser()
    adj_list = {}
    tweet_counter = 0
    with open(file, 'r') as tweets:
        for line in tweets:
            line = line.rstrip('\n')
            tweet = line.split("\t")
            timestamp = int(
                datetime.strptime(tweet[0],
                                  "%Y-%m-%d %H:%M:%S").strftime("%s"))
            username = tweet[1]
            result = p.parse(tweet[2], html=False).users
            if len(result) > 0:
                if username not in adj_list:
                    adj_list[username] = {}
                for i in result:
                    if i not in adj_list[username]:
                        adj_list[username][i] = {}
                        adj_list[username][i]['timestamps'] = [timestamp]
                    else:
                        adj_list[username][i]['timestamps'] += [timestamp]
                    adj_list[username][i]['timestamps'].sort()
                    adj_list[username][i]['first_mention'] = \
                        adj_list[username][i]['timestamps'][0]
                    adj_list[username][i]['number_of_mentions'] = len(
                        adj_list[username][i]['timestamps'])
    return adj_list
filename = "" model = "" #Load data and classifier model data = pd.read_csv(filename) loaded_model = joblib.load(model) #Classify data predictions = loaded_model.predict(data['processed_tweet']) #Decode classification results data['sentiment'] = pd.Series(sentiments(predictions)) #Code from the assignment script, props to Aku Hiltunen parser = prep.Parser() mentions = {'user': [], 'times mentioned': [], 'positivity': [],\ 'negativity': [], 'neutrality': []} hashtags = {'hashtag': [], 'times used': [], 'positivity': [],\ 'negativity': [], 'neutrality': []} for row in data.itertuples(): tweet = parser.parse(row.text.encode('utf-8').decode('utf-8')) sentiment = row.sentiment for mention in tweet.users: ignore = 0 user_to = mention.lower() for index, user in enumerate(mentions['user']):