def format_generated_body_text_as_html(article_text, image_url=None):
    """
    Given the text of the news story, format it in html so it looks
    more realistic - add paragraph breaks, turn urls into links, etc.
    """
    # Add html links to twitter @ handles, hashtags and regular urls
    p = ttp.Parser()
    result = p.parse(article_text)
    article_text = result.html

    # Split the generated body into lines
    lines = article_text.split("\n")

    # Bold any short lines that look like section titles
    new_lines = []
    for line in lines:
        if len(line) < 80 and "." not in line:
            line = f"<b>{line}</b>"
        new_lines.append(line)

    # Add paragraph tags between lines
    article_text = "<p>".join(new_lines)
    
    # If we have an image for the story, put it at the top.
    if image_url is not None:
        article_text = f"<img src='{image_url}'><p>{article_text}"
 
    return article_text
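For context, a minimal usage sketch (the sample text, image URL, and the from-ttp import path are assumptions, not part of the snippet above):

from ttp import ttp  # pip install twitter-text-python

body = "Launch Update\nWe shipped the new build today, see http://example.com"
html = format_generated_body_text_as_html(body, image_url="http://example.com/hero.jpg")
# The short, period-free first line is bolded as a section title, lines are
# joined with <p>, and ttp turns the URL into an anchor.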
Example No. 2
def link_urls(text):
    """
    Enriches URLs in the comment text with an anchor.
    """
    # Note: this reaches into ttp's internals (_urls, _parse_urls) rather
    # than calling Parser.parse(), so only URLs are linkified; mentions
    # and hashtags are left untouched.
    parser = ttp.Parser(max_url_length=40)
    parser._urls = []
    return ttp.URL_REGEX.sub(parser._parse_urls, text)
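A quick sketch of the result (sample text assumed; exact anchor markup depends on the installed ttp version):

print(link_urls("see http://example.com for details #nolink"))
# -> roughly: see <a href="http://example.com">http://example.com</a>
#    for details #nolink  (mentions and hashtags are left alone)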
def tweet_meta_features(inputs):
    corpus = numpy.array([tweet.processed_text for tweet in inputs])

    p = ttp.Parser()

    rows = []
    for tweet in corpus:
        parsed = p.parse(tweet)

        # Meta-features derived from the parsed tweet
        does_tweet_contain_link = (len(parsed.urls) > 0)
        num_of_links_in_tweet = len(parsed.urls)
        is_twitter_in_links = any(
            'https://twitter.com/' in url for url in parsed.urls)
        does_tweet_contain_userref = (len(parsed.users) > 0)
        does_tweet_contain_hashtag = (len(parsed.tags) > 0)
        num_of_hashtags_in_tweet = len(parsed.tags)
        happy_emoji_in_tweet = any(
            smiley in tweet
            for smiley in ('XD', ':)', '(:', '=|', '8D', ':P', ';D'))

        rows.append([
            does_tweet_contain_link, num_of_links_in_tweet,
            is_twitter_in_links, does_tweet_contain_userref,
            does_tweet_contain_hashtag, num_of_hashtags_in_tweet,
            happy_emoji_in_tweet
        ])

    return numpy.array(rows)
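A quick usage sketch; Tweet below is a stand-in for whatever objects the pipeline passes in (only a .processed_text attribute is required):

from collections import namedtuple

Tweet = namedtuple("Tweet", "processed_text")
X = tweet_meta_features([Tweet("loving this :) #yay http://example.com")])
print(X.shape)  # -> (1, 7): one tweet, seven meta-features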
Example No. 4
def parseText(text):

    p = ttp.Parser()
    ttp_result = p.parse(text)
    text = extractUsernamesHashtagsURLS(ttp_result, text)
    text = normalizeTextForTagger(text)
    text = removeEmails(text)
    text = removeLineBreaks(text)

    # Replace non-ASCII characters: map known emoji to sentiment words,
    # drop everything else. `emojis` maps character index -> codepoint
    # (as a string), matching the keys of the sentiments table.
    emojis = {}
    for i, character in enumerate(text):
        if ord(character) > 127:
            emojis[i] = str(ord(character))

    temp_text = list(text)
    for index, code in emojis.items():
        if code in sentiments["sentiments"]["emojis"]:
            temp_text[index] = str(
                sentiments["sentiments"]["emojis"][code]) + " "
        else:
            temp_text[index] = ""
    text = "".join(temp_text)

    return text  # Return the parsed/cleaned tweet
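The emoji substitution above relies on a sentiments table loaded elsewhere in the project. A self-contained sketch of the same index-to-codepoint mapping, with a stand-in table:

sentiments = {"sentiments": {"emojis": {"128512": "positive"}}}  # stand-in
text = "good morning \U0001F600"
chars = list(text)
for i, ch in enumerate(text):
    if ord(ch) > 127:
        word = sentiments["sentiments"]["emojis"].get(str(ord(ch)), "")
        chars[i] = word + " " if word else ""
print("".join(chars))  # -> "good morning positive "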
Example No. 5
 def render_html(self):
     if self.feed.type == 'TW':
         p = ttp.Parser()
         tt = p.parse(self.text)
         return tt.html
     else:
         return self.text
Example No. 6
def req_twitter(search_items):
    # Twitter API credentials: load these from config or the environment;
    # never hard-code live keys.
    _consumer_key = "YOUR_CONSUMER_KEY"
    _consumer_secret = "YOUR_CONSUMER_SECRET"
    _key = "YOUR_ACCESS_TOKEN"
    _secret = "YOUR_ACCESS_TOKEN_SECRET"

    _auth = OAuth1(_consumer_key, _consumer_secret, _key, _secret)

    p = ttp.Parser()

    # call twitter
    url = 'https://api.twitter.com/1.1/search/tweets.json'
    payload = {
        'q': search_items,
        'lang': 'en',
        'result_type': 'mixed',
        'count': '100'
        #'until': Get_Time()['today']
    }

    r = requests.get(url=url, auth=_auth, params=payload)
    result = r.json()

    # Format the tweet
    for row in result['statuses']:
        tw_result = p.parse(row['text'])
        row['text_html'] = tw_result.html

        # Format the creation date
        tweet_date = format_date(row['created_at'])
        row['date_text'] = tweet_date

    return result
Example No. 7
 def save(self, *args, **kwargs):
     # A new object needs a primary key before M2M tags can be attached,
     # so save once first.
     if not self.pk:
         super(Image, self).save(*args, **kwargs)
     p = ttp.Parser()
     result = p.parse(self.caption)
     for tag in result.tags:
         self.tags.add(tag)
     super(Image, self).save(*args, **kwargs)
Example No. 8
 def hashtags(self):
     p = ttp.Parser()
     result = p.parse(self.text)
     # Re-attach the leading '#' that ttp strips from each tag
     return ["#" + tag for tag in result.tags]
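A quick sketch calling the extracted method with a stand-in object (the real class supplies .text; the sample tweet is an assumption):

class FakeStatus(object):
    text = "launch day #python #ttp"

print(hashtags(FakeStatus()))  # -> ['#python', '#ttp']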
Example No. 9
 def save(self, *args, **kwargs):
     if not self.pk:
         super(Feed, self).save(*args, **kwargs)
     p = ttp.Parser()
     result = p.parse(self.post)
     for tag in result.tags:
         self.tags.add(tag)
     super(Feed, self).save(*args, **kwargs)
Example No. 10
    def process(self, tup):
        original_tweet = tup.values[3]

        ## Display tweet ##
        p = ttp.Parser()
        result = p.parse(original_tweet)
        display_tweet = result.html

        ## Common pre-processing ##
        preprocess_tweet = original_tweet
        # Remove RT
        preprocess_tweet = re.sub(r"RT @", "@", preprocess_tweet)
        # Remove URLs
        preprocess_tweet = re.sub(r"http\S+", "", preprocess_tweet)
        # Remove hashtags
        preprocess_tweet = re.sub(r"#\S+", "", preprocess_tweet)
        # Remove mentions
        preprocess_tweet = re.sub(r"@\S+", "", preprocess_tweet)
        # Collapse whitespace
        preprocess_tweet = re.sub(r"\s\s+", " ", preprocess_tweet).strip()

        ## NLTK pre-processing ##
        nltk_text = preprocess_tweet
        # Remove non-ASCII characters (including emojis)
        nltk_text = nltk_text.encode('ascii', 'ignore').decode('ascii')
        # Remove punctuation and digits
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        nltk_text = regex.sub('', nltk_text)
        regex = re.compile('[%s]' % re.escape(string.digits))
        nltk_text = regex.sub('', nltk_text)
        # Lowercase
        nltk_text = nltk_text.lower()
        # Tokenization
        words_token = word_tokenize(nltk_text)
        # Remove English stopwords
        words_nostop = [w for w in words_token
                        if w not in stopwords.words('english')]
        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        nltk_text = ' '.join(lemmatizer.lemmatize(w) for w in words_nostop)

        ## StanfordNLP pre-processing ##
        # Convert emojis to text
        snlp_text = re.sub(r"[_:]", " ", preprocess_tweet)
        snlp_text = emoji.demojize(snlp_text)
        snlp_text = re.sub(r"_", " ", snlp_text)
        snlp_text = re.sub(r":", " \" ", snlp_text)
        snlp_text = re.sub(r"\s\s+", " ", snlp_text).strip()

        ## Vader pre-processing ##
        vader_text = preprocess_tweet

        # `output` avoids shadowing the built-in `tuple`
        output = [tup.values[0], tup.values[1], tup.values[2], tup.values[3],
                  tup.values[4], tup.values[5], tup.values[6], tup.values[7],
                  tup.values[8], tup.values[9], display_tweet,
                  snlp_text, vader_text, nltk_text]

        storm.emit(output)
Example No. 11
def _format_instagram_message(instagram, full=True):
    # print instagram
    message_id = 'instagram_%s' % (instagram.id)
    message_date = instagram.created_time  #datetime.datetime.fromtimestamp(int(instagram.created_time))

    if not full:
        return {'message_id': message_id, 'message_date': message_date}
    """
    {'caption': Comment: inertiaunlimited said "What's better than going down a water slide in the Bahamas? Capturing the fun in high-speed, of course. #atlantisresort #xmo #sportstech",
 'comment_count': 1,
 'comments': [Comment: atlantisresort said "Amazing!"],
 'created_time': datetime.datetime(2015, 12, 7, 13, 33, 4),
 'filter': 'Normal',
 'id': '1134770313898096730_2142750893',
 'images': {'low_resolution': Image: https://scontent.cdninstagram.com/hphotos-xpf1/t51.2885-15/s320x320/e35/12277430_1682421975377994_535438069_n.jpg,
            'standard_resolution': Image: https://scontent.cdninstagram.com/hphotos-xpf1/t51.2885-15/s640x640/sh0.08/e35/12277430_1682421975377994_535438069_n.jpg,
            'thumbnail': Image: https://scontent.cdninstagram.com/hphotos-xpf1/t51.2885-15/s150x150/e35/12277430_1682421975377994_535438069_n.jpg},
 'like_count': 1,
 'likes': [User: itsofficialjojo],
 'link': 'https://www.instagram.com/p/-_g5aVjfha/',
 'location': Location: 145849377 (Point: (25.084288198, -77.321467938)),
 'tags': [Tag: xmo, Tag: sportstech, Tag: atlantisresort],
 'type': 'image',
 'user': User: inertiaunlimited,
 'users_in_photo': []}
    """

    parser = ttp.Parser()
    try:
        caption = _cleanhtml(instagram.caption.text)
    except AttributeError:  # post has no caption
        caption = ''
    parsed = parser.parse(caption)

    message_url = instagram.link
    instagram_url = (instagram.get_standard_resolution_url()).replace(
        "http://", 'https://')
    text = "<figure><a href='%s'><img src='%s' alt='%s'></a>\
        <figcaption>%s</figcaption></figure>"                                             %(message_url, instagram_url, \
        caption, _process_message_html(parsed.html))

    message = {
        'id': instagram.id,
        'message_id': message_id,
        'message_date': message_date,
        'message_timesince': _timesince(message_date),
        'user_name': instagram.user.full_name,
        'user_screen_name': instagram.user.username,
        'user_avatar_url': instagram.user.profile_picture,
        'user_profile_url':
        'https://instagram.com/%s' % instagram.user.username,
        'message_url': message_url,
        'message_html': text,
        'hashes': [hashtag.name.lower() for hashtag in instagram.tags],
        'raw_data': instagram
    }
    return message
Example No. 12
 def __init__(self, path):
     self.idxpath = path
     self.ix = open_dir(self.idxpath)
     self.query = MultifieldParser(['content', 'ctime'],
                                   schema=self.ix.schema)
     self.query.add_plugin(DateParserPlugin())
     self.sorter = MultiFacet(["ctime", ScoreFacet()])
     self.parser = ttp.Parser()
     self.dateparser = parser.parser()
Example No. 13
def clean(text):
    p = ttp.Parser(include_spans=True)
    r = p.parse(text)
    result = text
    for user in r.users:
        # Blank out the mention in place by its span; the replacement has
        # the same length, so later spans stay valid
        init = user[1][0]
        end = user[1][1]
        result = result[:init] + ' ' * (end - init) + result[end:]
    print(result)
    return result
Example No. 14
def parse(text):
    """
    uses ttp package to parse out user handles, hashtags, urls, html from tweet's text
    """
    p = ttp.Parser()
    result = p.parse(text)
    users = result.users
    tags = result.tags
    urls = result.urls
    html = result.html
    return [users, tags, urls, html]
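Usage sketch (the sample tweet text is an assumption):

users, tags, urls, html = parse("@alice check #python at http://example.com")
# users == ['alice'], tags == ['python'], urls == ['http://example.com']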
Example No. 15
 def update_tags(self):
     # Remove any existing tags for this post
     PostTag.objects.filter(post=self).delete()
     # Parse post summary for tags
     parser = ttp.Parser()
     result = parser.parse(self.summary)
     # Save new tags
     for tag_text in result.tags:
         tag, created = Tag.objects.get_or_create(text=tag_text)
         PostTag.objects.get_or_create(post=self, tag=tag)
Example No. 16
def format_tweet_text(tweet_text):
    """Convert tweet text to html: add mentions, hashtags, emoji, links."""
    # Emoji support
    tweet_processed_text = format_tweet_emojis(tweet_text)
    # Mentions, hashtags, links
    tweet_processed_text = ttp.Parser().parse(tweet_processed_text).html
    # Links customization
    tweet_processed_text = tweet_processed_text.replace(
        "<a ", '<a target="_blank" rel="noreferrer" ')

    return tweet_processed_text
Example No. 17
 def __init__(self):
     self.gender_name = GenderName()
     self.gender_slots = GenderSlots()
     self.db = pymongo.MongoClient().twitts
     self.dbTweets = pymongo.MongoClient().tweets
     self.twitter_parser = ttp.Parser()
     self.uniques = set()
     self.filter_regex = re.compile(
         r'.*(follow|siguen de vuelta en Twitter|una nueva foto en Facebook|He publicado|Acabo de publicar una foto|m at).*',
         re.UNICODE)
     self.html_parser = HTMLParser.HTMLParser()
Example No. 18
 def _getTermCounter(self, method, data):
     p = ttp.Parser()
     count_all = Counter()
     results = []
     for item in data:
         tw_content = item['root_tweet']
         if 'en' in tw_content['lang'] and tw_content['text'] is not None:
             r = p.parse(tw_content['text'])
             count_all.update(getattr(r, method))
             # count_all accumulates across tweets, so each entry in
             # results is a running total up to that tweet
             results.append(sum(count_all.values()))
         else:
             results.append(0)
     return results
Example No. 19
File: hw2.py  Project: Rokuji/Python
    def on_data(self, data):
        global unique_links
        # get the text from the tweet
        json_dict = json.loads(data)

        if 'text' not in json_dict:
            return True
        text = unicodedata.normalize('NFKD', json_dict['text']).encode(
            'ascii', 'ignore')
        parsed = ttp.Parser().parse(text)
        urls = parsed.urls
        # check if urls is empty and return immediately
        if not urls:
            return True

        # long_urls returns a dictionary of short_url to list[short_url, long_url]
        long_urls = utils.follow_shortlinks(urls)
        for url in urls:
            try:
                # use a regex to pull the link from the text
                links = long_urls[url]
                if links is not None:
                    # long urls returns a dictionary with a list of each url - take the last one
                    link = links[-1]
                    # the link is real - now need to get the link address
                    resp = urllib.urlopen(link)
                    geturl = resp.geturl()
                    parsedurl = urlparse(geturl)
                    domain = '{uri.scheme}://{uri.netloc}/'.format(
                        uri=parsedurl)
                    if resp.getcode() == 200:
                        # Add the long link to our unique links, or stop
                        # once we already have 100 of them
                        if len(unique_links) < 100:
                            if link not in unique_links:
                                if (domain != 'https://twitter.com/'
                                        and domain != 'https://t.co/'):
                                    with open('url.txt', 'a') as f:
                                        f.write(link + '\n')
                                    print link
                                    unique_links.add(link)
                                else:
                                    return True

                            return True
                        else:
                            return False
            except IncompleteRead:
                pass
Example No. 20
def _format_instagram_message_dict(instagram, full=True):
    # print instagram
    message_id = 'instagram_%s' % (instagram['id'])
    message_date = datetime.datetime.fromtimestamp(
        int(instagram['created_time']))

    if not full:
        return {'message_id': message_id, 'message_date': message_date}

    # print instagram

    parser = ttp.Parser()
    try:
        caption = _cleanhtml(instagram['caption']['text'])
    except (KeyError, TypeError):  # post has no caption
        caption = ''
    parsed = parser.parse(caption)

    message_url = instagram['link']
    instagram_url = (
        instagram['images']['standard_resolution']['url']).replace(
            "http://", 'https://')
    text = "<figure><a href='%s'><img src='%s' alt='%s'></a>\
        <figcaption>%s</figcaption></figure>"                                             %(message_url, instagram_url, \
        caption, _process_message_html(parsed.html))

    message = {
        'message_id': message_id,
        'message_date': message_date,
        'message_timesince': _timesince(message_date),
        'user_name': instagram['user']['full_name'],
        'user_screen_name': instagram['user']['username'],
        'user_avatar_url': instagram['user']['profile_picture'],
        'user_profile_url':
            'https://instagram.com/%s' % instagram['user']['username'],
        'message_url': message_url,
        'message_html': text,
        'hashes': [hashtag.lower() for hashtag in instagram['tags']],
        'raw_data': instagram
    }
    return message
Example No. 21
def _format_twitter_message(tweet, full=True):
    # print tweet
    message_id = 'tweet_%s' % (tweet['id'])
    message_date = datetime.datetime.strptime(tweet['created_at'],
                                              '%a %b %d %H:%M:%S +0000 %Y')

    if not full:
        return {'message_id': message_id, 'message_date': message_date}

    parser = ttp.Parser()
    parsed = parser.parse(tweet['text'])
    message_url = 'https://twitter.com/%s/status/%s' % (
        tweet['user']['screen_name'], tweet['id'])

    if 'media' in tweet['entities']:
        text = ("<figure><a href='%s'><img src='%s:large' alt='%s'></a>"
                "<figcaption>%s</figcaption></figure>"
                % (message_url,
                   tweet['entities']['media'][0]['media_url_https'],
                   _cleanhtml(parsed.html),
                   _process_message_html(parsed.html)))
    else:
        text = _process_message_html(parsed.html)

    return {
        'id': tweet['id'],
        'message_id': message_id,
        'message_date': message_date,
        'message_timesince': _timesince(message_date),
        'user_name': tweet['user']['screen_name'],
        'user_screen_name': tweet['user']['screen_name'],
        'user_avatar_url': tweet['user']['profile_image_url_https'],
        'user_profile_url':
            'https://twitter.com/%s' % tweet['user']['screen_name'],
        'message_url': message_url,
        'message_html': text,
        'hashes': [hashtag['text'].lower()
                   for hashtag in tweet['entities']['hashtags']],
        'raw_data': tweet
    }
Example No. 22
def create_tweet(title, link):
    # Reserve room for the appended link (139 = 140 minus the joining space)
    max_title_len = 139 - int(app.config['TWITTER_LINK_LEN'])
    p = ttp.Parser(include_spans=True)
    result = p.parse(title)
    urlcount = len(result.urls)
    if urlcount > 0:
        # Each URL in the title is shortened to t.co length when tweeted,
        # so swap its real span for TWITTER_LINK_LEN in the budget
        urlchars = 0
        for url in result.urls:
            urlchars = urlchars + (url[1][1] - url[1][0])
        max_title_len = (max_title_len
                         - (urlcount * int(app.config['TWITTER_LINK_LEN']))
                         + urlchars)
    title_len = len(title)
    if title_len > max_title_len:
        title = title[:max_title_len - 1] + u"\u2026"
    return title + ' ' + link
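The budget arithmetic in numbers, assuming the conventional t.co length of 23 for app.config['TWITTER_LINK_LEN']:

TWITTER_LINK_LEN = 23                    # assumed t.co length
max_title_len = 139 - TWITTER_LINK_LEN   # 116 chars left after the appended link
url_span = 40                            # a 40-char URL inside the title
max_title_len = max_title_len - TWITTER_LINK_LEN + url_span
print(max_title_len)  # -> 133: the in-title URL costs 23 when shortened, not 40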
Example No. 23
def htmlify_description(json_data):
    """Passed the raw JSON data about a User from Twitter's API, it returns an
    HTMLified version of the User's description.
    * Replaces t.co URLs with clickable, full links.
    * Makes #hashtags into clickable links.
    * Makes @usernames into clickable links.

    Different to htmlify_tweet() because:

        * Twitter user data only includes entities for urls, not hashtags etc.
          https://twittercommunity.com/t/why-do-user-entities-have-only-urls-field-and-not-others/59181

        * So we manually make the t.co links into their full, clickable version.
        * And then use twitter-text-python to linkify everything else.
    """

    # I don't think users in the Twitter archive JSON have description
    # elements:
    try:
        desc = json_data['description']
    except KeyError:
        return ''

    # Make t.co URLs into their original URLs, clickable.
    if 'entities' in json_data and 'description' in json_data['entities']:
        entities = json_data['entities']['description']

        if 'urls' in entities:
            for entity in entities['urls']:
                start, end = entity['indices'][0], entity['indices'][1]
                shown_url = entity['display_url']
                link_url = entity['expanded_url']

                url_html = '<a href="%s" rel="external">%s</a>'
                desc = desc.replace(json_data['description'][start:end],
                                            url_html % (link_url, shown_url))

    # Make #hashtags and @usernames clickable.
    parser = ttp.Parser()
    parsed = parser.parse(desc)

    return parsed.html
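A minimal usage sketch with hand-built entity data (every value below is an assumption):

user_json = {
    'description': 'Editor at https://t.co/abc123 #news',
    'entities': {'description': {'urls': [{
        'indices': [10, 29],
        'display_url': 'example.com',
        'expanded_url': 'http://example.com',
    }]}},
}
print(htmlify_description(user_json))
# Prints the description with the t.co URL expanded into an anchor and
# #news linkified; exact markup depends on the installed ttp version.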
Example No. 24
def vidDownload(saveThis, tweet):
    p = ttp.Parser()
    try:
        r = p.parse(tweet.decode('utf-8'))
        for link in r.urls:
            resp = urllib.request.urlopen(link)
            print(resp.url)
            if "https://www.periscope.tv/w/" in resp.url:
                hash_url = abs(hash(resp.url))
                with open('twitDB2.txt', 'a') as saveFile:
                    saveFile.write(str(hash_url))
                    saveFile.write('\n')
                    saveFile.write(saveThis)
                    saveFile.write('\n')
                process(hash_url, resp.url)
    except Exception as e:
        print(str(e))
Example No. 25
def make_tweets(find_id, original):
    #fix this error
    found = original[original['in_response_to_id'] == find_id]
    p = ttp.Parser()
    parsed = p.parse(found.iloc[0]['Tweet'])

    current_tweet = {}

    current_tweet['ID'] = found.iloc[0].in_response_to_id
    current_tweet['Tweet'] = found.iloc[0].Tweet
    current_tweet['Time'] = found.iloc[0].Time - (found.iloc[1].Time -
                                                  found.iloc[0].Time)
    try:
        current_tweet['User'] = parsed.users[0]
    except IndexError:  # tweet has no @mention
        current_tweet['User'] = ''
    current_tweet['Likes'] = 0
    current_tweet['Retweets'] = 0
    current_tweet['in_response_to_id'] = 0
    current_tweet['response_type'] = 'tweet'

    return pd.Series(current_tweet)
Example No. 26
def get_tweets(request):
    datas = []
    p = ttp.Parser()

    try:
        api = twitter.Api(
            consumer_key=settings.TWITTER_CONSUMER_KEY,
            consumer_secret=settings.TWITTER_CONSUMER_SECRET,
            access_token_key=settings.TWITTER_ACCESS_TOKEN,
            access_token_secret=settings.TWITTER_ACCESS_TOKEN_SECRET)

        tweets = api.GetUserTimeline(screen_name='kodlaco')
        for tweet in tweets:
            datas.append({
                #'text': p.parse(tweet.text).html,
                'text': tweet.text,
                'id_str': tweet.id_str
            })
    except Exception:
        datas = []

    return HttpResponse(json.dumps(datas), content_type="application/json")
Example No. 27
def createAdjacencyList(inputFile):
    p = ttp.Parser()

    # create an adjacency list

    adjacencyList = {}

    with open(inputFile, 'r') as twitterFile:
        for line in twitterFile:

            line = line.rstrip('\n')
            t = line.split("\t")

            timestamp = int(
                datetime.strptime(t[0], "%Y-%m-%d %H:%M:%S").strftime("%s"))
            username = t[1]
            mentionedUsers = p.parse(t[2], html=False).users

            if len(mentionedUsers) > 0:

                if username not in adjacencyList:
                    adjacencyList[username] = {}

                for m in mentionedUsers:
                    if m not in adjacencyList[username]:
                        adjacencyList[username][m] = {}
                        adjacencyList[username][m]['timestamps'] = [timestamp]
                    else:
                        adjacencyList[username][m]['timestamps'] += [timestamp]

                    adjacencyList[username][m]['timestamps'].sort()
                    adjacencyList[username][m]['firstMention'] = adjacencyList[
                        username][m]['timestamps'][0]
                    adjacencyList[username][m]['numberOfMentions'] = len(
                        adjacencyList[username][m]['timestamps'])

    return adjacencyList
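A quick usage sketch on a two-line tab-separated sample (note that strftime("%s") is a platform-specific extension, so this assumes a Unix-like system):

import tempfile

sample = ("2015-03-01 12:00:00\talice\t@bob hi\n"
          "2015-03-01 12:05:00\talice\t@bob again\n")
with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False) as fh:
    fh.write(sample)
adj = createAdjacencyList(fh.name)
print(adj['alice']['bob']['numberOfMentions'])  # -> 2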
Example No. 28
def clean(list_of_tweets_texts):
    p = ttp.Parser(include_spans=True)
    dataset = list_of_tweets_texts
    tweets = []
    authors = []
    dates = []
    for mystring, username, date in dataset:
        try:
            # The encode below is a validity check only: on Python 2 it
            # raises UnicodeDecodeError for bad input (caught below)
            mystring.encode('utf8', 'ignore')
            sentence = mystring
            result = p.parse(mystring)
            try:
                # Strip the first URL, if any, using its span
                start = result.urls[0][1][0]
                end = result.urls[0][1][1]
                sentence = mystring[:start] + mystring[end:]
            except IndexError:
                pass
            tweets.append(sentence)
            authors.append(username)
            dates.append(date)
        except (IndexError, UnicodeDecodeError) as e:
            pass
    return tweets, authors, dates
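Usage sketch; each input row is a (text, username, date) triple, as the unpacking above implies:

tweets, authors, dates = clean([
    ("read this http://example.com now", "alice", "2021-01-01"),
])
print(tweets)  # -> ['read this  now']  (the first URL is removed by its span)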
Example No. 29
def create_adjacency_list(file):
    """
    Twitter Parser 
    """
    p = ttp.Parser()
    adj_list = {}
    tweet_counter = 0
    with open(file, 'r') as tweets:
        for line in tweets:
            line = line.rstrip('\n')
            tweet = line.split("\t")

            timestamp = int(
                datetime.strptime(tweet[0],
                                  "%Y-%m-%d %H:%M:%S").strftime("%s"))
            username = tweet[1]
            result = p.parse(tweet[2], html=False).users

            if len(result) > 0:
                if username not in adj_list:
                    adj_list[username] = {}

                for i in result:
                    if i not in adj_list[username]:
                        adj_list[username][i] = {}
                        adj_list[username][i]['timestamps'] = [timestamp]
                    else:
                        adj_list[username][i]['timestamps'] += [timestamp]

                    adj_list[username][i]['timestamps'].sort()
                    adj_list[username][i]['first_mention'] = adj_list[
                        username][i]['timestamps'][0]
                    adj_list[username][i]['number_of_mentions'] = len(
                        adj_list[username][i]['timestamps'])

    return adj_list
Example No. 30
filename = ""
model = ""

#Load data and classifier model
data = pd.read_csv(filename)
loaded_model = joblib.load(model)

#Classify data
predictions = loaded_model.predict(data['processed_tweet'])

#Decode classification results
data['sentiment'] = pd.Series(sentiments(predictions))

#Code from the assignment script, props to Aku Hiltunen
parser = prep.Parser()

mentions = {'user': [], 'times mentioned': [], 'positivity': [],
            'negativity': [], 'neutrality': []}
hashtags = {'hashtag': [], 'times used': [], 'positivity': [],
            'negativity': [], 'neutrality': []}

for row in data.itertuples():
    tweet = parser.parse(row.text.encode('utf-8').decode('utf-8'))
    sentiment = row.sentiment

    for mention in tweet.users:
        ignore = 0
        user_to = mention.lower()

        for index, user in enumerate(mentions['user']):