Code Example #1
def preprocess(data):
    lang = data['lang']
    text_xx = 'text_' + lang

    data['tweet_date'] = datetime.datetime.strptime(data['created_at'], '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%dT%H:00:00.00Z')

    if 'extended_tweet' in data:
        data['tweet_text'] = data['extended_tweet']['full_text']
    else:
        data['tweet_text'] = data['text']
    
    if 'geo' in data and data['geo'] and 'coordinates' in data['geo'] and 'coordinates' in data['geo']['coordinates']:
        data['tweet_loc'] = ','.join(str(x) for x in data['geo']['coordinates']['coordinates'])
    elif 'place' in data and data['place'] and 'bounding_box' in data['place']:
        data['tweet_loc'] = ','.join(str(x) for x in data['place']['bounding_box']['coordinates'][0][0])

    parsed_text = p.parse(data['tweet_text'])
    data['tweet_emoticons'] = [t.match for e in [parsed_text.emojis, parsed_text.smileys] if e is not None for t in e ]
    data[text_xx] = p.clean(data['tweet_text']).lower()
    # words = re.findall(r'\w+', data[text_xx], flags = re.UNICODE)
    # if lang in lang_map:
    #     data[text_xx] = remove_stop_words(stopwords.words(lang_map[lang]), words)
    # elif lang == 'hi':
    #     data[text_xx] = remove_stop_words(hindi_words, words)
    # else:
    #     data[text_xx] = remove_stop_words(thai_words, words)
    return data
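
Example #1 relies on the tweet-preprocessor API (p.parse / p.clean) plus project helpers (remove_stop_words, lang_map, hindi_words, thai_words) defined elsewhere in that project. A minimal sketch of the library calls it depends on, assuming "import preprocessor as p"; exact cleaned strings may vary slightly by library version:

import preprocessor as p

sample = "Preprocessor is #awesome 👍 https://github.com/s/preprocessor"

# clean() strips whichever entities are enabled via set_options (all of them by default)
print(p.clean(sample))                          # e.g. "Preprocessor is"

# parse() reports each entity with .match / .start_index / .end_index, or None when absent
parsed = p.parse(sample)
if parsed.emojis is not None:
    print([e.match for e in parsed.emojis])     # e.g. ['👍']
if parsed.urls is not None:
    print([u.match for u in parsed.urls])       # e.g. ['https://github.com/s/preprocessor']
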
Code Example #2
    def preprocess_data(X):
        """

        :param X:
        :return:
        """

        X_new = np.array(X)
        for i, x in enumerate(X_new):
            # For each sample tweet x in the domain X_new, clean all URLs and numbers
            p.set_options(*CLEAN_OPTION_SET)
            s = p.clean(x)
            # Replace all caps words with placeholder ALL_CAPS_WORD
            split_tweet = s.split()
            for w in split_tweet:
                if w.isupper():
                    s = s.replace(w, "ALL_CAPS_WORD")
            p.set_options(*PARSE_OPTION_SET)
            # Figure out if the tweet is a retweet - if so, replace it with the person being retweeted
            tweet = s
            s = p.parse(tweet)
            p.set_options(*TOKENIZING_OPTION_SET)
            if s.reserved is not None and "RT" in s.reserved[0].match:
                tweet_modified = s.mentions[0].match
            else:
                tweet_modified = p.tokenize(tweet)
            # Replace all special characters with placeholders to make them unique
            for ch in SPECIAL_CHAR_SET:
                tweet_modified = tweet_modified.replace(
                    ch, SPECIAL_CHAR_SET[ch])
            X_new[i] = tweet_modified
        return X_new
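
preprocess_data above assumes module-level constants (CLEAN_OPTION_SET, PARSE_OPTION_SET, TOKENIZING_OPTION_SET, SPECIAL_CHAR_SET) that are not shown in the snippet. A hedged sketch of what such definitions could look like; these exact values are illustrative assumptions, not the original project's settings:

import preprocessor as p

CLEAN_OPTION_SET = (p.OPT.URL, p.OPT.NUMBER)                        # applied before p.clean()
PARSE_OPTION_SET = (p.OPT.RESERVED, p.OPT.MENTION)                  # applied before p.parse()
TOKENIZING_OPTION_SET = (p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG)   # applied before p.tokenize()
SPECIAL_CHAR_SET = {"&": " AMPERSAND_TOKEN ", "%": " PERCENT_TOKEN "}  # hypothetical placeholders
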
Code Example #3
def clean_tweet_text(tweet_text): 
    tweet_text = tweet_text.replace("’", "'").replace("…", "...")
    tweet_parser = p.parse(tweet_text)
    cleaned_tweet = tweet_text
    hash_tags = tweet_parser.hashtags
    if hash_tags is not None:
        for hash_tag in hash_tags:
            cleaned_tweet = cleaned_tweet.replace(hash_tag.match, " ".join(wordninja.split(hash_tag.match[1:])))
    tweet_urls = tweet_parser.urls
    if tweet_urls is not None:
        for url_link in tweet_urls:
            cleaned_tweet = cleaned_tweet.replace(url_link.match, " url$$ ")
    tweet_emojis = tweet_parser.emojis
    if tweet_emojis is not None:
        for emoji in tweet_emojis:
            cleaned_tweet = cleaned_tweet.replace(emoji.match, " emoji$$ ")
    cleaned_tweet = cleaned_tweet.split("via")[0].split("|")[0].split(" - ")[0].split(" – ")[0]
    cleaned_tweet_tokens = []
    for word_token in cleaned_tweet.split(" "):
        word_token = word_token.strip()
        if word_token.endswith("$$") or word_token in COMMON_ENGLISH_WORDS:
            cleaned_tweet_tokens.append(word_token)
        elif len(word_token) > 0:
            split_tokens = [w for w in wordninja.split(word_token) if w not in string.punctuation]
            cleaned_tweet_tokens += [token for token in split_tokens if not is_number(token)]

    cleaned_tweet = " ".join(cleaned_tweet_tokens)
    return cleaned_tweet
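
clean_tweet_text depends on wordninja plus an is_number helper and a COMMON_ENGLISH_WORDS set defined elsewhere in that project. A minimal usage sketch with hypothetical stand-ins for those two names:

import string
import wordninja
import preprocessor as p

COMMON_ENGLISH_WORDS = {"the", "a", "an", "and", "to", "of", "in", "is"}  # placeholder set

def is_number(token):
    # assumed helper: True when the token parses as a number
    try:
        float(token)
        return True
    except ValueError:
        return False

print(clean_tweet_text("Check out #DataScience news 😀 https://example.com via @someone"))
# roughly "Check out Data Science news emoji$$ url$$"
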
Code Example #4
File: test_api.py Project: s/preprocessor
    def test_set_options(self):
        tweet = "Preprocessor now has custom #options support! https://github.com/s/preprocessor"
        p.set_options(p.OPT.URL)
        parsed_tweet = p.parse(tweet)

        self.assertIsNone(parsed_tweet.hashtags)
        self.assertIsNotNone(parsed_tweet.urls)
Code Example #5
    def test_set_options(self):
        tweet = 'Preprocessor now has custom #options support! https://github.com/s/preprocessor'
        p.set_options(p.OPT.URL)
        parsed_tweet = p.parse(tweet)

        self.assertIsNone(parsed_tweet.hashtags)
        self.assertIsNotNone(parsed_tweet.urls)
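
The two tests above (the same check from different copies of s/preprocessor) pin down the behavior: once set_options(p.OPT.URL) is active, parse() only reports URLs and hashtags come back as None. The same thing outside the test harness, as a quick sketch:

import preprocessor as p

tweet = "Preprocessor now has custom #options support! https://github.com/s/preprocessor"
p.set_options(p.OPT.URL)
parsed = p.parse(tweet)
print(parsed.hashtags)                   # None - hashtags are not parsed under OPT.URL alone
print([u.match for u in parsed.urls])    # e.g. ['https://github.com/s/preprocessor']
print(p.clean(tweet))                    # URL removed, hashtag left in place
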
Code Example #6
def clean_tweet(tweet):
    tweet_clean = {key: tweet[key] for key in
                   ['created_at', 'id', 'id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str',
                    'in_reply_to_screen_name',
                    'retweet_count', 'favorite_count', 'lang']}
    if 'full_text' in tweet.keys():
        tweet_clean['text'] = tweet['full_text']
    elif 'extended_tweet' in tweet.keys():
        tweet_clean['text'] = tweet['extended_tweet']['full_text']
    else:
        tweet_clean['text'] = tweet['text']
    if 'quote_count' in tweet.keys(): tweet_clean['quote_count'] = tweet['quote_count']
    if 'reply_count' in tweet.keys(): tweet_clean['reply_count'] = tweet['reply_count']
    tweet_clean['datetime'] = datetime.fromtimestamp(parser.parse(tweet['created_at']).timestamp())
    if 'type' not in tweet.keys(): tweet_clean['type'] = tweet_type(tweet)
    if 'tweet_user_id' not in tweet.keys(): tweet_clean['tweet_user_id'] = tweet_creator(tweet)['id']
    if 'tweet_user_id_str' not in tweet.keys(): tweet_clean['tweet_user_id_str'] = tweet_creator(tweet)['id_str']
    if 'tweet_user_screen_name' not in tweet.keys(): tweet_clean['tweet_user_screen_name'] = tweet_creator(tweet)[
        'screen_name']

    tweet_clean['timestamp'] = parser.parse(tweet['created_at']).timestamp()

    tweet_clean['text_processed'] = preprocess_text(tweet_clean['text'])
    text = tweetp.parse(tweet_clean['text'])
    tweet_clean['emojis'] = min(length(text.emojis), 127)
    tweet_clean['hashtags'] = min(length(text.hashtags), 127)
    tweet_clean['urls'] = min(length(text.urls), 127)
    tweet_clean['mentions'] = min(length(text.mentions), 127)
    return tweet_clean
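
clean_tweet above leans on project helpers that are not shown (tweet_type, tweet_creator, preprocess_text, length) and on tweetp as the tweet-preprocessor alias. The min(..., 127) calls cap each count, presumably so it fits a small integer database column; a plausible, None-safe sketch of the assumed length helper (an assumption, not the original code):

def length(items):
    # assumed helper: len() that treats None (no entities found) as zero
    return len(items) if items is not None else 0
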
Code Example #7
def cleaning_sentence(sentence):
    stop_free = " ".join(
        [i for i in sentence.lower().split() if i not in stop])
    hashtag_value = p.parse(stop_free).hashtags
    sentence = p.clean(stop_free)
    sentence = re.sub("[^A-Za-z .]+", "", sentence)
    normalized = " ".join(lemma.lemmatize(word) for word in sentence.split())
    return normalized
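
cleaning_sentence assumes a module-level stop-word collection (stop) and lemmatizer (lemma), plus the p and re imports. A minimal setup sketch assuming NLTK, which is an assumption; the original project may wire these up differently:

import re
import preprocessor as p
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

stop = set(stopwords.words("english"))
lemma = WordNetLemmatizer()

print(cleaning_sentence("Loving the #sunshine today :) https://example.com"))
# e.g. "loving today" - with default options p.clean also strips the hashtag and smiley
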
Code Example #8
File: sentence.py Project: dvarelas/Linguini
    def _parse(text):
        """
        Parses elements from tweets

        :param text:
        :return:
        """
        return preproc.parse(text)
Code Example #9
    def updating_columns(self):
        # Creating other columns for post
        self.posts['hashtags'] = self.posts.text.apply(lambda t: [h.match for h in p.parse(t).hashtags] if p.parse(t).hashtags else None)
        self.posts['time'] = self.posts.time.apply(datetime.fromtimestamp)

        # Creating other columns for comments
        self.comments['time'] = self.comments.time.apply(datetime.fromtimestamp)
        self.comments['processed_comment'] = self.comments.text.str.lower()
        self.comments = stopwords_correction(self.comments, 'processed_comment')
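
Note that the hashtags column above calls p.parse twice per post (once to test, once to extract). A single-parse variant, as a sketch of the same result:

def _post_hashtags(text):
    tags = p.parse(text).hashtags
    return [h.match for h in tags] if tags else None

# self.posts['hashtags'] = self.posts.text.apply(_post_hashtags)
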
Code Example #10
def replace_hashtags(tweet):

    p.set_options(p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.URL, p.OPT.RESERVED)
    t = p.parse(tweet)
    if t.hashtags:
        for i in t.hashtags:
            tweet = tweet[:i.start_index] + ' ' + tweet[i.start_index + 1:]

    return tweet
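
A short usage sketch for replace_hashtags: it keeps each hashtag's word but blanks out the leading '#' character, using the start_index reported by p.parse:

print(replace_hashtags("Big win tonight #GoTeam #finally"))
# e.g. "Big win tonight  GoTeam  finally"
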
Code Example #11
def tweet_cleaning(text):
    parsed = p.parse(text)
    emojis = [x.match for x in parsed.emojis] if parsed.emojis is not None else []
    hashtags = [x.match for x in parsed.hashtags] if parsed.hashtags is not None else []
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.NUMBER)
    text = p.clean(text)
    p.set_options(p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.HASHTAG)
    text2 = p.clean(text)
    return [text, text2, emojis, hashtags]
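
tweet_cleaning returns two progressively cleaner variants of the text plus the raw emoji and hashtag matches; note the initial p.parse(text) runs under whatever options are currently set globally. A hedged usage sketch:

first, second, emojis, hashtags = tweet_cleaning("Big news #AI 😀 http://example.com @user")
# first   -> URL and mention stripped, hashtag and emoji kept, e.g. "Big news #AI 😀"
# second  -> hashtag and emoji stripped as well, e.g. "Big news"
# emojis  -> e.g. ['😀'];  hashtags -> e.g. ['#AI']
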
Code Example #12
    def insertTweetsIntoDB(self):
        concept = "coronavirus"
        analysisID = "passif"

        preProcessing = TweetsPreProcessing()
        dirPath = "C:/Users/Raouf/PycharmProjects/PFE_SII_M2/TweetFiles/"

        allFiles = [f for f in listdir(dirPath) if isfile(join(dirPath, f))]
        print(allFiles)
        for fileName in allFiles[63:]:
            print(fileName)
            if fileName.startswith("ExtractedTweetsFor"):
                if not fileName.endswith("Loaded.json"):  # check if the file is loaded to the database or not
                    fullFileName = dirPath + fileName
                    tweetsFile = open(fullFileName, 'r', encoding="utf-8")
                    tweets = json.load(tweetsFile)

                    cpt = 0
                    for tweet in tweets['tweets']:
                        tweetLanguage = tweet['lang']
                        if tweetLanguage == "en":
                            allTweets = AllTweets()
                            tweetID = tweet['id_str']
                            parsed_tweet = p.parse(tweet['text'])
                            hashtagsList = parsed_tweet.hashtags
                            print(hashtagsList)
                            hashtagsText = ""
                            if hashtagsList is not None:
                                for hashtag in hashtagsList:
                                    print(hashtag.match)
                                    hashtagsText += " "+ str(hashtag.match)
                            text = p.clean(tweet['text'])
                            text = " ".join(re.findall('\w+', text))
                            text += ", hashtags : "+hashtagsText
                            #text = "text"
                            row = [tweetID, text]
                            row += preProcessing.getLangage(tweet['lang']) + preProcessing.getLocation(
                                tweet['user']['location']) \
                                   + preProcessing.getTime(tweet['created_at']) + preProcessing.getSentimentAnalysis(
                                tweet['text']) \
                                   + preProcessing.getSource(tweet['source'])
                            row += [analysisID, concept]
                            #print(row)
                            try:
                                allTweets.insert(row)
                                cpt += 1
                                print(cpt, "tweets ", sep=" ")
                            except Exception:
                                print("encoding error")

                    print("For the file : ", fileName, ", Tweets number is : ", cpt)
                    tweetsFile.close()
Code Example #13
File: twitter_utils.py Project: HanGuo97/TwitterBot
def _tweet_preprocessing(tweet_texts):
    """Preprocessing
        1. remove URLs (and put them in separate place)
        2. remove Emojis

    Returns the cleaned Tweet and parsed URLs
    """
    cleaned_tweet_texts = twitter_prepro.clean(tweet_texts)
    parsed_url = twitter_prepro.parse(tweet_texts).urls
    if parsed_url is not None:
        parsed_url = [u.match for u in parsed_url]

    return cleaned_tweet_texts, parsed_url
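
A quick usage sketch, assuming "import preprocessor as twitter_prepro" (the alias used above) and the library's default options:

cleaned, urls = _tweet_preprocessing("Check this out 😀 https://example.com")
print(cleaned)   # e.g. "Check this out"
print(urls)      # e.g. ['https://example.com']
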
Code Example #14
    def get_tweet_url(tweet_text: str):
        parsed_tweet = cleaning_processor.parse(tweet_text)
        tweet_url = None
        if not parsed_tweet.urls:
            return tweet_url

        last_index = len(tweet_text)
        for url_info in parsed_tweet.urls:
            if url_info.end_index == last_index:
                tweet_url = url_info.match
                break

        return tweet_url
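
get_tweet_url (with cleaning_processor as the tweet-preprocessor module) only returns a URL that sits at the very end of the text, the position where Twitter appends links. A sketch treating it as a standalone function:

import preprocessor as cleaning_processor  # assumed alias

print(get_tweet_url("Great read https://example.com/post"))        # e.g. "https://example.com/post"
print(get_tweet_url("https://example.com/post was a great read"))  # None - the URL is not last
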
Code Example #15
def parse_mentions(tweet):
    """
    Parses a tweet for mentions.
    
    Parameters
    ----------
        tweet : the text of a tweet.
        
    Returns
        A list of parsed mentions (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.mentions is None:
        return []
    else:
        return [x.match for x in parsed_tweet.mentions]
Code Example #16
def parse_hashtags(tweet):
    """
    Parses a tweet for hashtags.
    
    Parameters
    ----------
        tweet : the text of a tweet.
        
    Returns
        A list of parsed hashtags (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.hashtags is None:
        return []
    else:
        return [x.match for x in parsed_tweet.hashtags]
Code Example #17
def parse_reserved_words(tweet):
    """
    Parses a tweet for reserved words ("RT" or "FAV").
    
    Parameters
    ----------
        tweet : the text of a tweet.
        
    Returns
        A list of parsed reserved words (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.reserved is None:
        return []
    else:
        return [x.match for x in parsed_tweet.reserved]
Code Example #18
def parse_smileys(tweet):
    """
    Parses a tweet for smiley faces.
    
    Parameters
    ----------
        tweet : the text of a tweet.
        
    Returns
        A list of parsed smiley faces (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.smileys is None:
        return []
    else:
        return [x.match for x in parsed_tweet.smileys]
Code Example #19
def parse_numbers(tweet):
    """
    Parses a tweet for numbers.
    
    Parameters
    ----------
        tweet : the text of a tweet.
        
    Returns
        A list of parsed numbers (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.numbers is None:
        return []
    else:
        return [x.match for x in parsed_tweet.numbers]
Code Example #20
def parse_urls(tweet):
    """
    Parses a tweet for URLs.
    
    Parameters
    ----------
        tweet : the text of a tweet.
        
    Returns
        A list of parsed URLs (or an empty list).
    """
    parsed_tweet = p.parse(tweet)
    if parsed_tweet.urls is None:
        return []
    else:
        return [x.match for x in parsed_tweet.urls]
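
The six parse_* helpers above share one pattern: call p.parse and map a None result to an empty list. A combined usage sketch, assuming the default options so every entity type is parsed:

tweet = "RT @bob: 10 reasons to love #python :) https://example.com 😀"
print(parse_mentions(tweet))        # e.g. ['@bob']
print(parse_hashtags(tweet))        # e.g. ['#python']
print(parse_reserved_words(tweet))  # e.g. ['RT']
print(parse_numbers(tweet))         # e.g. ['10']
print(parse_urls(tweet))            # e.g. ['https://example.com']
print(parse_smileys(tweet))         # e.g. [':)']
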
Code Example #21
File: twitclusters.py Project: bricaud/twittexp
def extract_info_from_cluster_table(cluster_edge_table):
	text_list = []
	htag_list = []
	url_list = []
	for index,row in cluster_edge_table.iterrows():
		username = row['user']
		#if not 'tweets' in row:
		#	continue # pass to the next user
		tweet_df = pd.read_json(row['tweets'])
		
		for idx,tweet in tweet_df.iterrows():
			htags = tweet['hashtags']
			urls = tweet['urls']
			text = tweet['text']
			retweet_count = tweet['retweet_count']
			favorite_count = tweet['favorite_count']

			parsed_tweet = tweetpre.parse(text)
			# extract emojis
			emojis = []
			if parsed_tweet.emojis is not None:
				emojis = [emo.match for emo in parsed_tweet.emojis]
			tweetpre.set_options(tweetpre.OPT.MENTION, tweetpre.OPT.URL)
			filtered_text = tweetpre.clean(text)
			tweetpre.set_options()
			#emojis = parsed_tweet.emojis.match ???
			url_c = [twitutils.convert_bitly_url(url_string) for url_string in urls]
			text_list.append({'text': text, 'user': username, 'url': url_c , 'emojis':emojis , 
				'retweet_count':retweet_count, 'favorite_count': favorite_count, 'filtered text': filtered_text, 
				'bcentrality': row['bcentrality']})
			htag_list += htags
			url_list += urls
	if not text_list:
		empty_df = pd.DataFrame()
		return {'text': empty_df, 'hashtags': empty_df, 'words': empty_df, 'urls': empty_df}
	text_df = pd.DataFrame(text_list)
	mostcommon_words_df = most_common_words(text_df['filtered text'])
	hashtags_df = count_order_items(htag_list,'hashtag')
	url_df = count_order_items(url_list,'url')
	url_df = twitutils.convert_bitly_table(url_df)
	filtered_url_df = twitutils.drop_twitter_urls(url_df)
	return {'text': text_df, 'hashtags': hashtags_df, 'words': mostcommon_words_df, 'urls': filtered_url_df}
Code Example #22
    def test_parse(self):
        tweet = 'A tweet with #hashtag :) @mention 😀 and http://github.com/s.'
        parsed_tweet = p.parse(tweet)

        self.assertIsNotNone(parsed_tweet.urls)
        self.assertEqual(1, len(parsed_tweet.urls))

        self.assertIsNotNone(parsed_tweet.hashtags)
        self.assertEqual(1, len(parsed_tweet.hashtags))

        self.assertIsNotNone(parsed_tweet.mentions)
        self.assertEqual(1, len(parsed_tweet.mentions))

        self.assertIsNone(parsed_tweet.reserved_words)

        self.assertIsNotNone(parsed_tweet.emojis)
        self.assertEqual(1, len(parsed_tweet.emojis))
        self.assertEqual("😀", parsed_tweet.emojis[0].match)

        self.assertIsNotNone(parsed_tweet.smileys)
        self.assertEqual(1, len(parsed_tweet.smileys))
        self.assertEqual(":)", parsed_tweet.smileys[0].match)
Code Example #23
    def test_parse(self):
        tweet = 'A tweet with #hashtag :) @mention 😀 and http://github.com/s.'
        p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)
        parsed_tweet = p.parse(tweet)

        self.assertIsNotNone(parsed_tweet.urls)
        self.assertEqual(1, len(parsed_tweet.urls))

        self.assertIsNotNone(parsed_tweet.hashtags)
        self.assertEqual(1, len(parsed_tweet.hashtags))

        self.assertIsNotNone(parsed_tweet.mentions)
        self.assertEqual(1, len(parsed_tweet.mentions))

        self.assertIsNone(parsed_tweet.reserved_words)

        self.assertIsNotNone(parsed_tweet.emojis)
        self.assertEqual(1, len(parsed_tweet.emojis))
        self.assertEqual("😀", parsed_tweet.emojis[0].match)

        self.assertIsNotNone(parsed_tweet.smileys)
        self.assertEqual(1, len(parsed_tweet.smileys))
        self.assertEqual(":)", parsed_tweet.smileys[0].match)
Code Example #24
File: test_api.py Project: s/preprocessor
    def test_parse(self):
        tweet = "A tweet with #hashtag :) @mention 😀 and http://github.com/s."
        p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)
        parsed_tweet = p.parse(tweet)

        self.assertIsNotNone(parsed_tweet.urls)
        self.assertEqual(1, len(parsed_tweet.urls))

        self.assertIsNotNone(parsed_tweet.hashtags)
        self.assertEqual(1, len(parsed_tweet.hashtags))

        self.assertIsNotNone(parsed_tweet.mentions)
        self.assertEqual(1, len(parsed_tweet.mentions))

        self.assertIsNone(parsed_tweet.reserved_words)

        self.assertIsNotNone(parsed_tweet.emojis)
        self.assertEqual(1, len(parsed_tweet.emojis))
        self.assertEqual("😀", parsed_tweet.emojis[0].match)

        self.assertIsNotNone(parsed_tweet.smileys)
        self.assertEqual(1, len(parsed_tweet.smileys))
        self.assertEqual(":)", parsed_tweet.smileys[0].match)
Code Example #25
tweet = []
hashlist = ""
menlist = ""
urllist = ""
emolist = ""
emolist1 = ""
#jsonFile = open("dilmabr.json",'a')
with open("ArvindKejriwal.json", 'r') as a:
#    y = json.loads(a)
    for line in a:
        content = json.loads(line)
        x = p.parse(content["text"])

        if content["in_reply_to_screen_name"] != "null":
            content["poi_name"] = content["user"]["screen_name"]
            content["poi_id"] = content["user"]["id"]
            poiname = content["poi_name"]
        else:
            content["poi_name"] = content["in_reply_to_screen_name"]
            content["poi_id"] = content["in_reply_to_user_id"]
            poiname = content["poi_name"]
Code Example #26
def get_usermentions(singletweet):
    return p.parse(singletweet).mentions
Code Example #27
# -*- coding: utf-8 -*-

import string

import preprocessor as p
from transliteration import getInstance

text = "RT @Pritpal77777: @Gurmeetramrahim GURU JI!! 61210 peoples take pledge to leave their drug and bad deeds n to adopt path of righteousness i\xe2\x80\xa6"
parsed_tweet = p.parse(text)
print(parsed_tweet.hashtags, parsed_tweet.mentions,
      parsed_tweet.reserved_words)

text = "ke liye best kar rahe hain to apki talash yahan par khatm hoti hai kyonki is post me main apko top video downloader apps ke baare me bataunga jinhe bahut hi aasani se install or use kiya jaa sakta ha."
t = getInstance()
t_text = t.transliterate(text, "hi_IN")
print(t_text)
# urls = None
#     emojis = None
#     smileys = None
#     hashtags = None
#     mentions = None
#     reserved_words = None
Code Example #28
def prepare_data(folder, input_name2hourList, phase_category2keywords,
                 output_flag):

    print "prepare_data()..."
    tweet_id2content = {}
    tweet_id_set = set()
    chunk2importance = {}
    chunk2IDF = {}

    #hashtag2tweet_num = {}
    mention2tweet_num = {}

    actor2tweet_num = {}
    actor2tweet_ids = {}
    actor2replied_actor = {}
    actor2geo = {}

    nlp = StanfordCoreNLP("../../tools/stanford-corenlp-full-2017-06-09")

    word2lemma = {}

    for input_name in input_name2hourList:

        input_file = open(folder + input_name, "r")
        hourList = input_name2hourList[input_name]

        for line in input_file:
            tweet = ast.literal_eval(line)

            if "body" in tweet and "actor_Username" in tweet:
                actor = "@" + tweet["actor_Username"]
                words = tweet["body"].split()
                if words[0][0] == "@":
                    if actor not in actor2replied_actor:
                        actor2replied_actor[actor] = [[
                            words[0], tweet["postedTime"]
                        ]]
                    else:
                        actor2replied_actor[actor] += [[
                            words[0], tweet["postedTime"]
                        ]]
                elif words[0] == "RT":
                    if actor not in actor2replied_actor:
                        actor2replied_actor[actor] = [[
                            words[1].replace(":", ""), tweet["postedTime"]
                        ]]
                    else:
                        actor2replied_actor[actor] += [[
                            words[1].replace(":", ""), tweet["postedTime"]
                        ]]

            if tweet_filter(tweet, hourList) == True:
                continue

            words = tweet["link"].split("/")
            tweet_id = words[3] + "_" + words[5]

            chunk_set, hashtag_set, mention_set = set(), set(), set()
            tweet_id_set.add(tweet_id)

            actor = "@" + words[3]

            if actor not in actor2tweet_num:
                actor2tweet_num[actor] = 1
            else:
                actor2tweet_num[actor] += 1

            if actor not in actor2tweet_ids:
                actor2tweet_ids[actor] = [tweet_id]
            else:
                actor2tweet_ids[actor] += [tweet_id]

            if "geo" in tweet:
                if actor not in actor2geo:
                    actor2geo[actor] = [tweet["geo"]]
                else:
                    actor2geo[actor] += [tweet["geo"]]

            parsed_tweet = p.parse(tweet["body"])
            mentions = parsed_tweet.mentions
            hashtags = parsed_tweet.hashtags

            sentence_chunks = set()
            for chunk in tweet["chunkList"]:
                if chunk[0] is None:
                    continue

                new_chunk = None
                if chunk[1] in word2lemma:
                    new_chunk = word2lemma[chunk[1]]
                else:
                    new_chunk = lemmatize(nlp, chunk[1])
                    word2lemma[chunk[1]] = new_chunk

                if new_chunk not in chunk2importance:
                    chunk2importance[new_chunk] = list([chunk[2]])
                else:
                    chunk2importance[new_chunk] += [chunk[2]]

                sentence_chunks.add(new_chunk)

                chunk_set.add(new_chunk)

            for new_chunk in sentence_chunks:
                if new_chunk not in chunk2IDF:
                    chunk2IDF[new_chunk] = 1.0
                else:
                    chunk2IDF[new_chunk] += 1.0

            if hashtags is not None:
                for hashtag in hashtags:
                    tag = hashtag.match.lower()
                    hashtag_set.add(tag)
                    #if tag not in hashtag2tweet_num:
                    #    hashtag2tweet_num[tag] = 1
                    #else:
                    #    hashtag2tweet_num[tag] += 1

            if mentions is not None:
                for mention in mentions:
                    m = mention.match
                    mention_set.add(m)
                    if m not in mention2tweet_num:
                        mention2tweet_num[m] = 1
                    else:
                        mention2tweet_num[m] += 1

            if "geo" in tweet:
                tweet_id2content[tweet_id] = {
                    "body": tweet["body"],
                    "actor": actor,
                    "chunks": chunk_set,
                    "hashtags": hashtag_set,
                    "mentions": mention_set,
                    "geo": tweet["geo"],
                    "postedTime": tweet["postedTime"]
                }
            else:
                tweet_id2content[tweet_id] = {
                    "body": tweet["body"],
                    "actor": actor,
                    "chunks": chunk_set,
                    "hashtags": hashtag_set,
                    "mentions": mention_set,
                    "postedTime": tweet["postedTime"]
                }

        input_file.close()

    nlp.close()

    total_doc = len(tweet_id_set)
    for chunk in chunk2IDF:
        chunk2IDF[chunk] = math.log(total_doc / chunk2IDF[chunk])

    pickle.dump(tweet_id_set, open(output_flag + "tweet_id_set.p", "wb"))
    pickle.dump(chunk2importance, open(output_flag + "chunk2importance.p",
                                       "wb"))
    pickle.dump(chunk2IDF, open(output_flag + "chunk2IDF.p", "wb"))
    pickle.dump(tweet_id2content, open(output_flag + "tweet_id2content.p",
                                       "wb"))

    #pickle.dump(hashtag2tweet_num, open(output_flag + "hashtag2tweet_num.p", "wb"))
    pickle.dump(mention2tweet_num,
                open(output_flag + "mention2tweet_num.p", "wb"))
    pickle.dump(actor2tweet_num, open(output_flag + "actor2tweet_num.p", "wb"))
    pickle.dump(actor2tweet_ids, open(output_flag + "actor2tweet_ids.p", "wb"))
    pickle.dump(actor2replied_actor,
                open(output_flag + "actor2replied_actor.p", "wb"))
    pickle.dump(actor2geo, open(output_flag + "actor2geo.p", "wb"))

    for phase_category in phase_category2keywords:
        keywords = set(phase_category2keywords[phase_category])
        category_tweet_id_set = set()

        for tweet_id in tweet_id2content:
            #if len(tweet_id2content[tweet_id]["chunks"] & keywords) != 0:
            cleaned_tweet_words = p.clean(tweet_id2content[tweet_id]["body"])
            cleaned_tweet_words = process_text(cleaned_tweet_words)
            cleaned_tweet_words = set(cleaned_tweet_words.split())

            if len(cleaned_tweet_words & keywords) != 0:
                category_tweet_id_set.add(tweet_id)

        pickle.dump(category_tweet_id_set,
                    open(phase_category + "/tweet_id_set.p", "wb"))
Code Example #29
                splits.append(split_on_numbers[0])
            else:
                splits.append(upper_split)


tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')
word_vectors = []
length_vector = []
index = 0
start = timeit.default_timer()
text_file = open("cleanedTweets.txt", "w")
for tweets in open('tweets.txt', encoding="utf-8"):
    p.set_options(p.OPT.HASHTAG)
    parsed_tweet = p.parse(tweets)
    hashtags = parsed_tweet.hashtags
    splits = []
    if hashtags:
        for hashtag in hashtags:
            split_hashtags(hashtag)
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.NUMBER,
                  p.OPT.HASHTAG)
    stop_words = set(stopwords.words("english"))
    extra_stop_words = [
        '.', '|', '+', '~', '✓', '︎', '“', "'", '—', '⠀', '-', ',', '•', '・',
        '_', '!', '&', ')', '(', '…', '️', ' ', '...', '"', '/', '?', '', '..',
        ':'
    ]
    for symbol in extra_stop_words:
        stop_words.add(symbol)
Code Example #30
def extract_elements(tweets):
    """ Function that uses the tweet-preprocessor and emojis libraries
        to identify and save:
            - #Hashtags
            - @Mentions
            - Emojis

    Args:
        tweets: list containing all tweets

    Returns:
        hashtags: list of hashtags
        mentions: list of mentions
        emojis: list of emojis

    Sources:
        https://pypi.org/project/tweet-preprocessor/
        https://github.com/carpedm20/emoji/

    """

    # set the global options for the library. These settings define which
    # elements of the tweet to pay attention to
    tp.set_options(tp.OPT.URL, tp.OPT.EMOJI, tp.OPT.MENTION,
                   tp.OPT.NUMBER, tp.OPT.HASHTAG)

    # create empty lists to store the results
    hashtags = []
    mentions = []
    emojis = []

    # iterate over all tweets in the list
    for tweet in tweets:

        # parse tweet to extract the relevant elements defined in the options
        parsed_tweet = tp.parse(tweet)

        # 1. save the hashtags
        h_tweet = []
        if parsed_tweet.hashtags is not None:
            for hashtag in parsed_tweet.hashtags:
                h_tweet.append(hashtag.match)

        # save to the global list as a space separated string
        hashtags.append(' '.join(h_tweet))

        # 2. save the emojis (using the library)
        e_tweet = []
        if len(emoji.emoji_lis(tweet)) > 0:
            for e in emoji.emoji_lis(tweet):
                e_tweet.append(e['emoji'])

        # save to the global list as a space separated string
        emojis.append(' '.join(e_tweet))

        # 3. save the mentions
        m_tweet = []
        if parsed_tweet.mentions is not None:
            for mention in parsed_tweet.mentions:
                m_tweet.append(mention.match)

        # save to the global list as a space separated string
        mentions.append(' '.join(m_tweet))

    return(hashtags, mentions, emojis)
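
A brief usage sketch for extract_elements, assuming "import preprocessor as tp" and "import emoji" as in the source project; the three returned lists are aligned by tweet index, with entities joined by spaces:

hashtags, mentions, emojis = extract_elements([
    "Loving #python today with @alice 😀",
    "no entities in this one",
])
print(hashtags)  # e.g. ['#python', '']
print(mentions)  # e.g. ['@alice', '']
print(emojis)    # e.g. ['😀', '']
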
Code Example #31
File: pos_tagger.py Project: ADITYA727/HeLp_for_ML
def cleaning_sentence(sentence):
	hashtag_value = p.parse(sentence).hashtags
	sentence = p.clean(sentence)	
	sentence = re.sub("[^A-Za-z .]+","", sentence)
	return sentence
Code Example #32
def csv_to_json(filename, output_file):

    d = {}

    with open(os.path.join(DATA_DIR, filename), encoding="utf-8") as r:
        reader = csv.reader(r, delimiter=",", quotechar='"')

        next(reader, None)
        for row in reader:
            link, date, time, author_id, followers, following, *content_rows = row

            content = ""

            for row in content_rows:
                content += f" {row}"

            url, id_str = link.split("statuses/")

            hashtags = []
            mentions = []
            url_strings = []

            parsed_tweet = p.parse(content)

            try:
                for hashtag in parsed_tweet.hashtags:
                    hashtags.append(hashtag.match)
            except TypeError:
                pass

            try:
                for mention in parsed_tweet.mentions:
                    mentions.append(mention.match)
            except TypeError:
                print(f"{author_id}: {content}")

            try:
                for url_string in parsed_tweet.urls:
                    url_strings.append(url_string.match)
            except TypeError:
                pass

            # store the parsed row keyed by tweet id
            d[id_str] = {
                'id_str': id_str,
                'date': date,
                'text': content,

                # To do: tokenize tweet to get retweets
                'retweet_count': 0,
                'favorite_count': 0,
                'reply_to': 0,
                'coordinates': 0,
                'reply_to_tweet': 0,
                'user_screen_name': f"@{author_id}",
                'quoted_status': 0,
                'lang': 0,
                'entities': 0,
                'urls': url_strings,
                'hashtags': hashtags,
                'user_mentions': mentions,
                'user': author_id,
            }

    with open(os.path.join(RESULTS_DIR, "converted_" + output_file + ".json"),
              'w') as f:
        f.write(json.dumps(d, indent=1))

    print("Success")
Code Example #33
    else:
        reply_text = tweet_text
    tweet_lang = tweet['lang']
    hashtags = ''
    mentions = ''
    tweet_urls = ''
    tweet_emoticons = ''
    tweet_date = tweet['created_at']
    tweet_loc = tweet['coordinates']

    #if 'retweeted_status' in tweet:
    #    retweeted_status = tweet['retweeted_status']
    #    isRetweet = True
    #else:
    #    isRetweet = False
    parsedTweet = tpp.parse(tweet_text)

    splitTime = tweet_date.split()
    month = months[splitTime[1]]
    year = splitTime[5]
    day = splitTime[2]
    time = splitTime[3]  #.split(':')[0]+':00:00'
    tweet_date = year + '-' + month + '-' + day + 'T' + time + 'Z'
    try:
        for temp in parsedTweet.hashtags:
            hashtags = hashtags + temp.match + ' '
        hashtags = hashtags.replace('#', '').strip(' ')
    except TypeError as te:
        hashtags = ''
    try:
        for temp in parsedTweet.mentions: