Example #1
def classify_tweets(labeled_tweets, testing_tweets, mode="find_irrelevant"):
    """Train a random forest on the labeled tweets and assign a class to each unlabeled testing tweet."""

    X, y, vectorizer, labeled_tweets_id = generate_training_feature_matrix(
        labeled_tweets, mode=mode)

    model = random_forest(X, y)

    resulting_tweets = []
    tweets = []
    tweets_txt = []

    has_url = []
    has_username = []

    for tweet in testing_tweets:
        tweet_id = int(float(tweet['id']))
        if (tweet_id in labeled_tweets_id):
            continue
        # already classified
        if (tweet['class']):
            resulting_tweets.append(tweet)
            continue

        text = tweet['text']

        has_url.append(common.has_url(text))
        has_username.append(common.has_username(text))

        text = common.sanitize_text(text)

        tweets_txt.append(text)
        tweets.append(tweet)

    logger.info('already labeled: [%d]; testing: [%d]' %
                (len(resulting_tweets), len(tweets)))

    X_test = vectorizer.transform(tweets_txt)
    X_test = X_test.toarray()

    X_test = np.column_stack((X_test, has_username, has_url))

    y_pred = model.predict(X_test)

    #logger.info('prediction: [%d]'%(len(y_pred)))
    for c, tweet in zip(y_pred, tweets):

        if (mode == 'find_smoker'):
            tweet['class'] = c
        elif (mode == 'find_irrelevant'):
            tweet['class'] = -1 if c == 1 else 0

        resulting_tweets.append(tweet)

    return resulting_tweets
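
random_forest is called above but not shown on this page. A minimal sketch of what it plausibly does, assuming it wraps scikit-learn's RandomForestClassifier; the estimator choice and hyperparameters are assumptions, not the original project's code:

from sklearn.ensemble import RandomForestClassifier

def random_forest(X, y):
    # Assumed helper: fit a random forest on the stacked TF-IDF + flag features.
    # Hyperparameters are illustrative, not taken from the original module.
    model = RandomForestClassifier(n_estimators=100, random_state=0)
    model.fit(X, y)
    return model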
Example #2
def make_dataset():
    """Recreate the dataset/ directory with one sanitized text file per language."""
    if isdir("dataset"):
        rmtree("dataset")

    mkdir("dataset")

    for lang in language_names:
        raw_lang_file_name = path(WIKIDATASET_DIR_RAW, lang + "_raw")
        with open(raw_lang_file_name, "r") as f:
            raw_text = f.read()
            sanitized = sanitize_text(raw_text)
            with open(path("dataset", lang), "w") as wf:
                wf.write(sanitized)
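
make_dataset depends on several module-level names that the snippet does not show. One plausible set of imports and constants, stated purely as an assumption (the directory name and language codes are placeholders, and sanitize_text is whatever sanitizer the project defines elsewhere):

from os import mkdir
from os.path import isdir, join as path   # assumed: path(...) acts like os.path.join
from shutil import rmtree

WIKIDATASET_DIR_RAW = "wikidataset_raw"   # placeholder raw-text directory
language_names = ["en", "de", "it"]       # placeholder language codes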
Example #3
def generate_training_feature_matrix(labeled_tweets, mode='find_irrelevant'):
    """Build the training matrix (TF-IDF n-grams plus URL/username flags), labels, fitted vectorizer, and the set of tweet ids used."""

    # with open(labeled_transgender_identification_csv_filename, 'r', newline='', encoding='utf-8') as rf:
    #     reader = csv.DictReader(rf)

    training_tweets = []
    tweets_id = set()
    y = []
    has_url = []
    has_username = []
    for tweet in labeled_tweets:
        tweet_id = int(float(tweet['id']))

        c = determine_class(tweet, mode=mode)
        if c is None:
            continue
        y.append(c)

        text = tweet['text']

        has_url.append(common.has_url(text))
        has_username.append(common.has_username(text))

        text = common.sanitize_text(text)

        training_tweets.append(text)
        tweets_id.add(tweet_id)

    logger.info('positive: %d; negative: %d' % (y.count(1), y.count(0)))

    # Alternative settings tried: token_pattern=r'\b\w+\b', binary counts, or a plain
    # CountVectorizer(analyzer="word", ngram_range=(1, 2), binary=True, min_df=1).
    vectorizer = TfidfVectorizer(
        analyzer="word", ngram_range=(1, 2), min_df=1, max_features=5000
    )
    X = vectorizer.fit_transform(training_tweets)
    X = X.toarray()

    X = np.column_stack((X, has_username, has_url))

    # vocab = vectorizer.get_feature_names()

    # # Sum up the counts of each vocabulary word
    # dist = np.sum(train_data_features, axis=0)

    # # For each, print the vocabulary word and the number of times it
    # # appears in the training set
    # for tag, count in zip(vocab, dist):
    #     print(count, tag)
    return X, np.array(y), vectorizer, tweets_id
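
The feature layout built here (and reused by classify_tweets above) is a dense TF-IDF matrix with two extra boolean columns appended via np.column_stack. A small self-contained sketch of that construction, with invented texts and flag values:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

texts = ["quit smoking today", "nice weather today"]   # invented examples
has_username = [1, 0]   # e.g. the result of common.has_username(text)
has_url = [0, 1]        # e.g. the result of common.has_url(text)

vec = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=1, max_features=5000)
X = vec.fit_transform(texts).toarray()           # shape: (2, n_terms)
X = np.column_stack((X, has_username, has_url))  # append the two flag columns
print(X.shape)                                   # (2, n_terms + 2)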
Example #4
def transform_json_to_txt(datafolder, txtfile):
    """Append the sanitized text of every tweet found under datafolder to txtfile."""

    for root, dirs, files in os.walk(os.path.abspath(datafolder)):
        for f in files:
            if (f == 'search.json'):
                continue

            with open(os.path.join(root, f), 'r', newline='', encoding='utf-8') as json_f, \
                 open(txtfile, 'a+', newline='', encoding='utf-8') as wf:
                
                logger.info('processing: [%s]'%os.path.join(root, f))

                for line in json_f:
                    #logger.info(line)

                    try:
                        tweet = json.loads(line)
                        
                        wf.write('%s '%common.sanitize_text(tweet['text']))

                    except Exception as exc:
                        #logger.warn(exc)
                        pass
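
A hedged usage sketch, assuming the folder holds files with one tweet JSON object per line (as the per-line json.loads above implies); both paths are placeholders:

# Hypothetical paths for illustration only.
transform_json_to_txt('data/tweets_raw', 'data/tweets_corpus.txt')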
Example #5
def sanitize_file(filename):
    with open(filename) as f:
        text = f.read()

    # Rewrite in text mode: sanitize_text is assumed to return a str here,
    # as in the other examples on this page, so binary mode would raise a TypeError.
    with open(filename, 'w') as f:
        f.write(sanitize_text(text))
Example #6
def filter_raw(raw_json_data_folder):
    """Walk raw_json_data_folder and keep unique English tweets with usable sanitized text that geolocate to a US state."""
    tweets = []
    # texts = set()
    tweetIds = set()
    cnt_total_no_duplicate = 0
    geocoded_cnt = 0
    lang_cnt = 0
    place_cnt = 0
    gps_cnt = 0
    duplicated_cnt = 0
    sanitized_cnt = 0
    total_cnt = 0
    for root, dirs, files in os.walk(os.path.abspath(raw_json_data_folder)):
        for f in files:
            if (f != 'search.json' and not f.startswith('.')):
                logger.info(os.path.join(root, f))

                with open(os.path.join(root, f), 'r') as json_file:

                    for line in json_file:

                        try:
                            if (line.startswith('{')):
                                tweet = json.loads(line)
                                total_cnt += 1
                                if (int(tweet['id']) in tweetIds):
                                    duplicated_cnt += 1
                                    continue
                                else:
                                    tweetIds.add(int(tweet['id']))
                                cnt_total_no_duplicate += 1

                                if ('lang' in tweet and 'en' != tweet['lang']):
                                    continue

                                if ('lang' not in tweet):
                                    lang, prob = langid.classify(tweet['text'])
                                    if (lang != 'en'):
                                        continue
                                lang_cnt += 1

                                sanitized_text = common.sanitize_text(tweet['text']).strip()
                                if sanitized_text == '' or sanitized_text == 'RT':
                                    continue
                                sanitized_cnt += 1

                                geolocation = tweet['user']['location']

                                if 'place' in tweet and tweet['place']:

                                    if (tweet['place']['country_code'] != 'US'):
                                        continue
                                    else:
                                        geolocation = tweet['place']['full_name']

                                us_state = tug.get_state(geolocation)

                                if (not us_state):
                                    continue

                                geocoded_cnt += 1

                                tweets.append({
                                    'id': tweet['id'],
                                    'text': tweet['text'],
                                    'clean_text': sanitized_text,
                                    'place': tweet['place'] if 'place' in tweet else '',
                                    'user_location': tweet['user']['location'],
                                    'us_state': us_state,
                                    'created_at': tweet['created_at'],
                                    'username': tweet['user']['name'],
                                    'user_id': tweet['user']['id'],
                                    'is_quote_status': tweet['is_quote_status']
                                    })
                        except Exception as exc:
                            logger.info(line)
                            logger.warning('ignore: %s' % exc)

    logger.info('total: %d; duplicate: %d; no_duplicate: %d; en: %d; sanitize_text: %d; geo: %d' %
                (total_cnt, duplicated_cnt, cnt_total_no_duplicate, lang_cnt, sanitized_cnt, geocoded_cnt))

    return tweets
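
A possible way to persist the records returned by filter_raw, assuming downstream code wants a flat CSV; the helper name, output path, and use of csv.DictWriter are illustrative, not part of the original module:

import csv

def write_filtered_tweets(tweets, out_path='filtered_tweets.csv'):
    # Illustrative only: dump the dicts produced by filter_raw to a CSV file.
    if not tweets:
        return
    with open(out_path, 'w', newline='', encoding='utf-8') as wf:
        writer = csv.DictWriter(wf, fieldnames=list(tweets[0].keys()))
        writer.writeheader()
        writer.writerows(tweets)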