Example #1
def run(dataset, features, word_embedding, metrics, fname):
    if dataset.lower().startswith('f'):
        df = load_fdcl18()
    else:
        df = load_dwmw17()
    tqdm.pandas(desc='Preprocessing Progress: ')
    df['clean_tweet'] = df.tweet.progress_apply(TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    tqdm.pandas(desc='Tokenizing Progress: ')
    df['tokens'] = df.clean_tweet.progress_apply(TweetTokenizer().tokenize)
    # Feature Extraction
    # tfidf_vectorizer
    ff = []
    if 'tfidf_vectorizer' in features:
        tfidf_kwargs = dict(
            tokenizer=TweetTokenizer().tokenize,
            stop_words=stopwords,
            min_df=.0025,
            max_df=0.25,
            ngram_range=(1, 3)
        )
        ff += [('tfidf_vectorizer', TfidfVectorizer(**tfidf_kwargs), 'clean_tweet')]
    # framenet_pipeline
    if 'framenet_pipeline' in features:
        count_vectorizer = ('count_vectorizer', CountVectorizer())
        truncated_svd = ('truncated_svd', TruncatedSVD(algorithm='randomized', n_components=10))
        ff += [('framenet_pipeline', Pipeline([count_vectorizer, truncated_svd]), 'framenet')]
    # mean_embedding
    if 'mean_embedding' in features:
        ff += [('mean_embedding', mean_embedding(word_embedding), 'tokens')]
    # hatebase_vectorizer
    if 'hatebase_vectorizer' in features:
        ff += [('hatebase_vectorizer', HatebaseVectorizer(features=features['hatebase_vectorizer']), 'clean_tweet')]
    # transfer_vectorizer
    if 'transfer_vectorizer' in features:
        hyper_params = features['transfer_vectorizer']
        hyper_params['module'] = TextCNN
        hyper_params['corpus'] = df.tokens
        hyper_params['word_vectors'] = word_embedding
        args = [NeuralNetClassifier, hyper_params, ['conv_%i' % i for i in range(3)], False]
        ff += [('transfer_vectorizer', TransferVectorizer(*args), 'tokens')]
    # Estimator
    pipeline = Pipeline([('column_transformer', ColumnTransformer(ff)), ('clf', LinearSVC())])
    # Evaluation (Cross Validation)
    # Cross-validate and save predictions
    cv = CrossValidator(pipeline, n_splits=5, scoring=metrics)
    df['predictions'], cv_results = cv.cross_val_predict(df, df.label, return_scores=True)
    # Print scores
    pprint({'dataset': dataset, 'features': features})
    pprint(cv_results)
    scores = {}
    for scorer in metrics:
        scores[scorer] = ['%.2f' % (np.average(cv_results[scorer]) * 100) + ',']
    pprint(scores, type='table')
    # Save predictions
    df.to_excel(scratch_path('predictions_%s_%s.xlsx' % (dataset, fname)))
Example #2
def tokenize_tweets(tweet_dict):
    tokenized_tweets = {}
    tknzr = TweetTokenizer()
    for k, v in tweet_dict.items():  # .iteritems() on Python 2
        tokenized_tweet = tknzr.tokenize(v)
        tokenized_tweets[k] = tokenized_tweet
    return tokenized_tweets
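A minimal usage sketch for the function above; the IDs and tweet texts are made-up illustrations, and the function is assumed to be in scope alongside the TweetTokenizer import it needs:

from nltk.tokenize import TweetTokenizer

# Hypothetical input: tweet IDs mapped to raw tweet text.
tweets = {
    101: "Loving the new #NLTK release!!! :-) http://nltk.org",
    102: "@user thanks, that was sooo helpful",
}
tokenized = tokenize_tweets(tweets)
print(tokenized[101])  # a list of tokens, e.g. ['Loving', 'the', 'new', '#NLTK', ...]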
Example #3
def clean_tweets(tweet):
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False,
                               strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in emoticons and  # remove emoticons
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean
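The snippet above relies on module-level stopwords_english, emoticons, and stemmer objects that are not shown. A minimal setup sketch under that assumption (the emoticon set here is only an illustrative placeholder):

import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

stopwords_english = stopwords.words('english')
stemmer = PorterStemmer()
# Placeholder set; the original module presumably defines a fuller emoticon list.
emoticons = {':)', ':-)', ':(', ':-(', ':D', ';)'}

print(clean_tweets("RT @user: loving the $GE rally!!! #stocks https://example.com"))
# -> a list of stemmed tokens such as ['love', 'ralli', 'stock']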
Example #4
class PartsOfSpeechExtractor(BaseEstimator, TransformerMixin):

    IGNORE_TAGS = ['PUNCT', 'CCONJ']
    _vectorizer = None
    _tokenizer = TweetTokenizer(reduce_len=True)
    _pos_helper = PartsOfSpeechHelper()

    def __init__(self):
        pass

    def transform(self, data, y=None):
        result = []

        for tweet in data:
            result.append(self.pos_tag(tweet))

        # Fit the DictVectorizer lazily on the first batch of documents seen.
        if self._vectorizer is None:
            self._vectorizer = DictVectorizer(sparse=False)
            self._vectorizer.fit(result)

        return self._vectorizer.transform(result)

    def pos_tag(self, tweet):
        tokens = self._tokenizer.tokenize(tweet)
        pos_tweet = self._pos_helper.pos_tag(tokens)
        return Counter([t for w, t in pos_tweet if t not in self.IGNORE_TAGS])

    def fit(self, df, y=None):
        return self
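A usage sketch for the transformer above, assuming PartsOfSpeechHelper and the rest of the original project are importable; the pipeline layout and training-data names are hypothetical:

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# POS-distribution features feeding a linear classifier.
model = Pipeline([
    ('pos_features', PartsOfSpeechExtractor()),
    ('clf', LinearSVC()),
])
# model.fit(train_tweets, train_labels)    # train_tweets: iterable of raw tweet strings
# model.predict(test_tweets)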
Example #5
File: util.py Project: gsubp/taia
def ler_csv(csv_file):
    base = list()
    with open(csv_file, 'r', encoding='utf-8') as csv_file:
        try:
            reader = csv.reader(csv_file, delimiter='|')

            for row in reader:
                texto = TweetTokenizer().tokenize(row[4])

                # Keep only tokens that are not links, hashtags or mentions;
                # lowercase them and strip accents.
                texto = [remove_acento(t.lower()) for t in texto
                         if 'https' not in t and '#' not in t and '@' not in t]

                base.append(tuple([texto, str(row[0]).replace('\ufeff', '')]))

        except IOError:
            pass
    return base
Example #6
    def removeHighAndLowFrequencyWords(self, lines, percentage=0.4):
        tk = TweetTokenizer()
        dictionary = OrderedDict()

        # create dictionary
        for line in lines:
            l = tk.tokenize(self.normalizeSentence(line))
            self.lines.append(l)
            for token in l:
                if len(token) > 1 or re.search(r'\w', token):
                    if dictionary.get(token) is None:
                        dictionary[token] = 1
                    else:
                        dictionary[token] += 1

        # remove high frequency and low frequency words
        dictionary = sorted(dictionary.items(),
                            key=operator.itemgetter(1),
                            reverse=False)

        while dictionary[0][1] < 5:
            del dictionary[0]

        index = math.floor(len(dictionary) * percentage)
        for i in range(index):
            del dictionary[0]
            del dictionary[-1]
        self.dictionary = dictionary
Example #7
def token(X_train, X_test):
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    x_train = []
    word_dict = {}
    word_index = 1

    for doc in X_train:
        word_seq = []
        for word in tknzr.tokenize(doc):
            if word not in word_dict:
                word_dict[word] = word_index
                word_index += 1
            word_seq.append(word_dict[word])
        x_train.append(word_seq)

    x_train = sequence.pad_sequences(x_train, maxlen=200, padding='post')
    word_dict['unknown-words-in-test'] = 0

    x_test = []
    for doc in X_test:
        word_seq = []
        for word in tknzr.tokenize(doc):
            if word in word_dict:
                word_seq.append(word_dict[word])
            else:
                word_seq.append(0)
        x_test.append(word_seq)

    x_test = sequence.pad_sequences(x_test, maxlen=200, padding='post')

    return x_train, x_test, word_dict
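A small usage sketch for the function above. `sequence` is assumed to be `keras.preprocessing.sequence` (its `pad_sequences` produces the fixed-length arrays); the toy documents are made up:

X_train = ["@bob I loooove this!", "worst. day. ever."]
X_test = ["I love this day", "never seen words like these"]

x_train, x_test, word_dict = token(X_train, X_test)
print(x_train.shape, x_test.shape)  # (2, 200) and (2, 200) after padding to maxlen=200
# Index 0 is reserved for words that appear only in the test set; training
# words are numbered from 1 upward in word_dict.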
Example #8
def cleanText(x):
    # x = json.loads(x)
    # tmp = x
    # x = x["text"]
    #
    if len(x) != 0:
        # Remove escaped unicode sequences like \uXXXX
        regex03 = r'\\u[a-zA-Z0-9]{4}'
        k = re.sub(regex03, '', str(x))
        text = re.sub(r"http\S+", "", str(k))
        text = text.decode('utf-8')

        # removes emoticons and other symbols
        try:
            # UCS-4
            highpoints = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # UCS-2
            highpoints = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        text = highpoints.sub('', text)

        tknzr = TweetTokenizer(reduce_len=True)
        a = tknzr.tokenize(text)

        # Punctuation remover: keep only alphanumeric tokens
        c = [i for i in a if i.isalnum()]
        c = " ".join(c)
        # c = {"id" : tmp["id"], "text" : c}
        return c
Example #9
def normalize_tweet(tweet):
    # convert the tweet to lower case
    tweet = tweet.lower()
    # convert all urls to sting "URL"
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)

    # correct all multiple white spaces and punctuations to a single white space/punctuation
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    tweet = re.sub(r'[\s]+', ' ', tweet)
    tweet = re.sub(r'\!{2,}', '!', tweet)

    # convert "#topic" to just "topic"
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)

    # Extracting words(tokens) from the tweet
    twt_token = TweetTokenizer(strip_handles=True)
    token = twt_token.tokenize(tweet)

    # Removing stop words
    stop_words = set(stopwords.words('english'))
    word_list = [tkn for tkn in token if tkn not in stop_words]

    # Using Rule Based Stemmer to find word stems
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in word_list]

    # Creating a sentence from the stems
    norm_tweet = " ".join(stems)

    return norm_tweet
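A quick usage sketch of the function above with a made-up tweet; the exact tokens depend on NLTK's English stop-word list and the Porter stemmer:

raw = "Check this out!!! https://t.co/xyz #NLProc ... sooo cool @someone"
print(normalize_tweet(raw))
# -> a space-joined string of stems, roughly 'check ! url nlproc sooo cool'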
Example #10
 def load(self):
     # Load dictionary
     inBody = False
     with open(self.dict_path, 'r', encoding='utf-8') as r:
         next(r)
         for line in r:
             if inBody:
                 segs = line.strip().split('\t')
                 token = segs[0]
                 for cate_id in segs[1:]:
                     self.token_category[token].append(int(cate_id))
             else:
                 if line.startswith('%'):
                     inBody = True
                 else:
                     self.category_num += 1
     tokenizer = TweetTokenizer()
     with open(self.data_path, 'r', encoding='utf-8') as r:
         for line in r:
             tid, tweet, _ = line.rstrip().split('\t')
             tokens = tokenizer.tokenize(tweet)
             tokens = [t.replace('#', '').lower() for t in tokens]
             category_count = [0] * self.category_num
             for token in tokens:
                 for i in range(min(len(token), 5)):
                     # Match the longest known prefix: the full token first,
                     # then with 1..4 trailing characters dropped (i == 0 is
                     # handled separately because token[:-0] is empty).
                     prefix = token if i == 0 else token[:-i]
                     if prefix in self.token_category:
                         for cate in self.token_category[prefix]:
                             category_count[cate - 1] += 1
                         break
             if len(tokens) > 0:
                 category_count = [c / len(tokens) for c in category_count]
             self.tid_vector[tid] = torch.FloatTensor(category_count)
Example #11
def render_wordcloud(form, **kwargs):
    session = Session()
    results = search.search(session, **form.values())
    # Create the corpus from the results
    tknzr = TweetTokenizer()
    texts = []
    for r in results:
        tokens = []
        for sent in sent_tokenize(r.text.strip()):
            tokens += [
                w for w in tknzr.tokenize(sent.strip())
                if w.lower() not in stopwords_en
            ]
        texts.append(tokens)
    corpus = nltk.TextCollection(texts)
    corpus.collocations(100)
    # noinspection PyProtectedMember
    results = {
        'vocabulary': [list(i) for i in corpus.vocab().most_common(1000)],
        'collocations': corpus._collocations,
    }
    view = render_template('./templates/search/results_wordcloud.html',
                           form=form,
                           results=results,
                           **kwargs)
    session.close()
    return view
Example #12
 def text_total_counts(self):
     with codecs.open(self._filepath + ".txt", "r", "latin-1") as f:
         lines = f.read()
         tknzr = TweetTokenizer()
         tknz_lines = tknzr.tokenize(lines)
         self._totalcount = len(tknz_lines)
     return self._totalcount
Example #13
 def ngrams(self):
     #        name = re.findall("\w+$",self._filepath)
     name = str(input("choose a seed: "))
     with codecs.open(self._filepath + ".txt", "r", "latin-1") as f:
         lines = f.read()
         tknzr = TweetTokenizer()
         tknz_lines = tknzr.tokenize(lines)
     emptylist = []
     maxhistory = int(input("Choose n for ngram, preferably 2 or 3: "))
     for i in range(2, maxhistory + 1):
         emptylist += nltk.ngrams(tknz_lines, i)
     cfd = ConditionalFreqDist([(tuple(a), b) for *a, b in emptylist])
     seed = [name]
     for i in range(100):
         for j in range(maxhistory - 1, 0, -1):
             if tuple(seed[-j:]) in cfd:
                 valuesum = sum(cfd[tuple(seed[-j:])].values())
                 value = random.randint(0, valuesum)
                 for key in cfd[tuple(seed[-j:])].keys():
                     value -= cfd[tuple(seed[-j:])][key]
                     if value <= 0:
                         seed.append(key)
                         break
                 break
             else:
                 continue
     return seed
Example #14
    def boolenModel(self, freq, onlyfiles):

        self.comboBox_4.clear()
        self.comboBox_4.addItem(' ')
        requete = self.plainTextEdit_2.toPlainText()
        requete = requete.lower()

        req = TweetTokenizer().tokenize(requete)

        for file in onlyfiles:

            reqtemp = []
            for mot in req:
                mot = mot.lower()
                if (mot in ['and', 'or', '(', ')', 'not']):
                    reqtemp.append(mot)
                    reqtemp.append(' ')
                else:
                    listfile = self.indexmotSimple(mot)
                    if (file in listfile):
                        reqtemp.append('1')
                        reqtemp.append(' ')
                    else:
                        reqtemp.append('0')
                        reqtemp.append(' ')
            evaluation = eval(''.join(reqtemp))
            if (evaluation == 1):

                self.comboBox_4.addItem(file)
Example #15
def main():
    HOME_DIR = "semeval_parsed"
    np.random.seed(123)
    input_fname = '200M'
    embedding = 'custom'
    type = '200M'
    ndim = 52

    data_dir = HOME_DIR + '_' + input_fname
    fname_vocab = os.path.join(data_dir, 'vocab_{}.pickle'.format('topic'))

    tknr = TweetTokenizer()
    alphabet = cPickle.load(open(fname_vocab))
    tok_words = {}
    words = []
    for word, idx in alphabet.iteritems():
        tok_word = tknr.tokenize(word.decode('utf-8'))
        tok_words[idx] = tok_word
        words.extend(tok_word)

    print len(tok_words)
    print len(words)
    print "Vocab size", len(alphabet)
    fname, delimiter, ndim = (
        'embeddings/updated_embeddings_custom_200M'.format(type, str(ndim)),
        ' ', ndim)

    word2vec = load_glove_vec(fname, words, delimiter, ndim)

    print 'len', len(word2vec)
    ndim = len(word2vec[word2vec.keys()[0]])
    print 'ndim', ndim

    random_words_count = 0
    vocab_emb = np.zeros((len(alphabet) + 1, ndim), dtype='float32')

    for idx, tok_word in tok_words.iteritems():
        isrand = 1
        word_vec = np.zeros(ndim)
        for tok in tok_word:
            if tok in word2vec.keys():
                word_vec += word2vec[tok]
                isrand = 0

        if isrand:
            word_vec = np.random.uniform(-0.25, 0.25, ndim)
            random_words_count += 1
        vocab_emb[idx] = word_vec.astype(np.float32) / len(tok_word)
    print "Using zero vector as random"
    print 'random_words_count', random_words_count

    svd = TruncatedSVD(n_components=5)
    vocab_emb = svd.fit_transform(vocab_emb).astype(np.float32)
    print vocab_emb.shape
    fname = 'embeddings/smiley_tweets_embedding_{}'.format('topic')
    outfile = os.path.join(data_dir,
                           'emb_{}.npy'.format(os.path.basename(fname)))
    print outfile
    np.save(outfile, vocab_emb)
Example #16
def search():

    # validate screen_name
    screen_name = request.args.get("screen_name", "")
    if not screen_name:
        return redirect(url_for("index"))
    positives = os.path.join(sys.path[0], "positive-words.txt")
    negatives = os.path.join(sys.path[0], "negative-words.txt")
    # get screen_name's tweets
    tweets = helper.get_user_timeline(screen_name)

    # TODO
    analyzer = Analyzer(positives, negatives)

    s = tweets
    s = str(s)
    # analyze word
    tw = TweetTokenizer()
    #print(tw.tokenize(s))
    p = tw.tokenize(s)
    score = analyzer.analyze2(p)

    positive = float(score[0])
    negative = abs(float(score[1]))
    neutral = score[2]

    # generate chart
    chart = helper.chart(positive, negative, neutral)

    # render results
    return render_template("search.html", chart=chart, screen_name=screen_name)
Example #17
def preprocess(docs, sentiments, n):
    """
    Filters <br> tags, URLs and twitter handles
    :param docs: Document list
    :param sentiments: Sentiment list
    :param n: Number of documents
    :return: Processed corpus
    """
    processed_tweets = list()
    processed_sentiments = list()
    tok = TweetTokenizer()

    for i, doc in enumerate(docs):
        if i > n:
            return processed_tweets, processed_sentiments

        if not pd.isna(sentiments[i]):
            #print(doc)
            #print(type(doc))
            #tokens = list(filter(lambda a: not a.startswith('<br' or '@' or 'http'), tok.tokenize(doc))) #tokenize and filter out <br>
            tokens = tok.tokenize(doc)
            tweet_new = ' '.join(tokens)
            processed_tweets.append(tweet_new)
            processed_sentiments.append(str(sentiments[i]))

    return processed_tweets, processed_sentiments
Example #18
def clean_tweets(classifier, df, stop_words):
    tknzr = TweetTokenizer()
    for idx, row in df.iterrows():
        tokens = tknzr.tokenize(row['tweet_text'])  # NLTK tweet tokenizer

        custom_tokens = remove_noise(tokens, stop_words)
        # Assign with .at rather than chained indexing (df['tokens'][idx]),
        # which triggers SettingWithCopyWarning and may write to a copy.
        df.at[idx, 'tokens'] = custom_tokens

        score = classifier.classify(
            dict([token, True] for token in custom_tokens))
        df.at[idx, 'sentiment'] = score

    return df
Example #19
def tokenize_tweets(texts, segment=True, segment_vocab=None):
    tknzr = TweetTokenizer()
    token_x = [tknzr.tokenize(t) for t in texts]
    if not segment:
        return token_x

    # if segmentation is requested, split unknown tokens into words
    wordsegment.load()
    tokens = []
    for line in token_x:
        tokens += line
    counter = Counter(tokens)
    # identify segment-able words that are not in the known vocabulary
    segment_vocab = segment_vocab or set()
    segmented = {}
    for word in counter:
        if word not in segment_vocab:
            pieces = wordsegment.segment(word)
            if len(pieces) > 1:
                segmented[word] = pieces
    # reconstruct the list
    _token_x = []
    for line in token_x:
        _line = []
        for token in line:
            if token in segmented.keys():
                _line += segmented[token]
            else:
                _line.append(token)
        _token_x.append(_line)
    return _token_x
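A usage sketch for the tokenizer above. It needs the third-party `wordsegment` package, and `segment_vocab` is any collection of words that should be left intact (a toy set here):

known = {"i", "love", "this"}
print(tokenize_tweets(["i love this #somuchfun"], segment=True, segment_vocab=known))
# The hashtag-style token gets split into pieces such as 'so', 'much', 'fun',
# while the in-vocabulary words pass through unchanged.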
Example #20
def main(model_file, out_tsv_file, out_labels_file, data_file_path, vocab_file_path):
    model = load_keras_model(model_file)

    uid = uuid4().hex
    os.makedirs(uid)

    samples = load_samples(data_file_path)
    train_samples, val_samples = train_val_split(samples)
    val_provider = TripletProvider(val_samples, shuffle=True)

    tokenizer = TweetTokenizer()
    tokenized_samples = [tokenizer.tokenize(sample.text) for sample in train_samples]

    vocabulary = joblib.load(vocab_file_path)
    vocabulary.fit((c for tokens in tokenized_samples for token in tokens for c in token))

    transformer = HierarchicalTripletTransformer(vocabulary)

    max_document_length, max_token_length = get_max_length(tokenized_samples)
    val_generator = TripletBatchGenerator(val_provider, transformer, max_document_length, max_token_length,
                                          len(vocabulary), 1)

    vectors = []
    labels = []
    for X, y, triplet in val_generator:
        for xi in X:
            prediction = model.predict(xi)
            vectors.append(prediction)
            labels.append(y)

    np.savetxt(out_tsv_file, vectors, delimiter='\t')
    np.savetxt(out_labels_file, labels, delimiter='\t', fmt='%s')
Example #21
    def convertDataToVec(self, data, labels, batchSize=5000):
        if len(data) - self.indexTracking < batchSize:
            batchSize = len(data) - self.indexTracking
            self.batchFlag = True

        clf = Word2Vec.load("w2v.model")
        d = np.array([])
        counts = 0
        for line in data[self.indexTracking:]:
            if counts == batchSize:
                break
            counts += 1
            tmp = np.array([0] * 300)
            tk = TweetTokenizer()
            l = tk.tokenize(self.normalizeSentence(line))
            count = 0
            for w in l:
                count += 1
                try:
                    s = clf.wv.get_vector(w)
                    s = np.array(s)
                    tmp = np.add(tmp, s)
                except:
                    continue

            tmp = tmp / count
            d = np.concatenate((d, tmp))

        l = self.convertLabelToVec(labels, batchSize)
        self.indexTracking += batchSize

        return l, d
Example #22
def run(dataset, hyperparameters, metrics, fname=None):
    # # Load Resources
    word2vec = None
    if hyperparameters['model'] != 'rand':
        word2vec = load_word2vec()
    # # Load Dataset
    df = load_dataset(dataset[0], **dataset[1])
    # # Preprocess
    df['clean_tweets'] = df.tweet.apply(
        TweetPreprocessor(normalize=['link', 'mention']).preprocess)
    df['tokens'] = df.clean_tweets.apply(TweetTokenizer().tokenize)
    X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
        df.tokens, df.label)
    # # Train
    clf = NeuralNetClassifier(module=TextCNN,
                              corpus=df.tokens,
                              word_vectors=word2vec,
                              metrics=metrics,
                              **hyperparameters)
    clf.fit(X_train, y_train, validation_data=(X_dev, y_dev))
    # # Predict
    y_pred = clf.predict(X_test)
    # # Evaluate
    pprint(
        dict(dataset=dataset,
             hyperparameters=hyperparameters,
             scores={
                 scorer: get_score_func(scorer)(y_test, y_pred)
                 for scorer in metrics
             }))
    # # Save to file
    results = X_test.to_frame(name='tokens')
    results['pred'] = y_pred
    results.to_excel(scratch_path('predictions_%s.xlsx' % fname))
Example #23
def main():
    # x, y = load_dataset("datasets/sentiment_uci/yelp_labelled.txt")
    x, y = load_datasets(["../datasets/sentiment_uci/yelp_labelled.txt"])

    stopwords = set()
    with open('../stopwords.txt', 'r') as f:
        for w in f:
            stopwords.add(w.strip())

    tok = TweetTokenizer()
    stemmer = EnglishStemmer()
    vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True, binary=True, preprocessor=stemmer.stem,
                                 tokenizer=tok.tokenize, ngram_range=(1, 2))

    accu_p = np.zeros(shape=(2,))
    accu_r = np.zeros(shape=(2,))
    accu_f = np.zeros(shape=(2,))
    accu_a = 0.0
    folds = 10
    for train_idx, test_idx in StratifiedKFold(n_splits=folds, shuffle=True).split(x, y):
        train_x, train_y = x[train_idx], y[train_idx]
        test_x, test_y = x[test_idx], y[test_idx]

        cls = svm.NuSVC(nu=0.5, kernel='rbf')

        # train
        train_x = vectorizer.fit_transform(train_x).toarray()

        cls.fit(train_x, train_y)

        # test
        test_x = vectorizer.transform(test_x).toarray()

        pred_y = cls.predict(test_x)

        # evaluate
        p, r, f, _ = precision_recall_fscore_support(test_y, pred_y)
        a = accuracy_score(test_y, pred_y)
        accu_p += p
        accu_r += r
        accu_f += f
        accu_a += a

        print("Evaluating classifier:")
        print("\tAccuracy: {}".format(a))
        print("\tPrecision[0]: {}".format(p[0]))
        print("\tPrecision[1]: {}".format(p[1]))
        print("\tRecall[0]: {}".format(r[0]))
        print("\tRecall[1]: {}".format(r[1]))
        print("\tF1-score[0]: {}".format(f[0]))
        print("\tF1-score[1]: {}".format(f[1]))

    print("Average evaluation")
    print("\tAccuracy: {}".format(accu_a / folds))
    print("\tPrecision[0]: {}".format(accu_p[0] / folds))
    print("\tPrecision[1]: {}".format(accu_p[1] / folds))
    print("\tRecall[0]: {}".format(accu_r[0] / folds))
    print("\tRecall[1]: {}".format(accu_r[1] / folds))
    print("\tF1-score[0]: {}".format(accu_f[0] / folds))
    print("\tF1-score[1]: {}".format(accu_f[1] / folds))
Example #24
def preprocess_text(tweet_text):
    tweet_tokenizer = TweetTokenizer()

    tokens = [
        token.lower().lstrip("@").lstrip("#")
        for token in tweet_tokenizer.tokenize(tweet_text)
    ]
    tokens_no_contra = [
        contractions[token].split() if token in contractions else [token]
        for token in tokens
    ]
    flat_list = [item for sublist in tokens_no_contra for item in sublist]
    tokens_semi_final = [
        token for token in flat_list
        if token not in punctuations and token not in en_stopwords
    ]
    final_t = [
        token.replace("'s", "") for token in tokens_semi_final
        if not re.match(r'((www\.[^\s]+)|(https?://[^\s]+))', token)
    ]

    text = []
    wnl = WordNetLemmatizer()
    tagged = pos_tag(final_t)
    for word, tag_prior in tagged:
        tag = nltk_tag_to_wordnet_tag(tag_prior)
        word = "not" if word == "n't" else word
        if tag:
            text.append(wnl.lemmatize(word.lower(), tag))
        else:
            text.append(wnl.lemmatize(word.lower()))

    return text
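The function above depends on contractions, punctuations, en_stopwords, and nltk_tag_to_wordnet_tag, none of which are shown. A plausible minimal setup, with a toy contractions map and the usual Penn-Treebank-to-WordNet tag mapping (both are assumptions, not the original module's definitions):

import re
import string

from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

punctuations = set(string.punctuation)
en_stopwords = set(stopwords.words('english'))
contractions = {"can't": "can not", "won't": "will not"}  # toy subset


def nltk_tag_to_wordnet_tag(nltk_tag):
    # Map Penn Treebank tags to WordNet POS constants; None lets the
    # lemmatizer fall back to its default (noun).
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    if nltk_tag.startswith('V'):
        return wordnet.VERB
    if nltk_tag.startswith('N'):
        return wordnet.NOUN
    if nltk_tag.startswith('R'):
        return wordnet.ADV
    return None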
Example #25
def tokenize_tweets(input_file_name, out_file_name, type_file):
    outf = open(out_file_name, 'w')
    infn = open(input_file_name, 'r')
    tknzr = TweetTokenizer()

    while 1:
        lines = infn.readlines(100000)
        if not lines:
            break
        for line in lines:
            # ignore blank lines
            if not line.strip():
                continue
            if type_file == 'split':
                # test, dev, train tokenization
                tweetId, startPos, endPos, mention, screenName, tweet, mediaURL = line.strip().split('\t')
            elif type_file == 'kb':
                # timeline tokenization
                x, y, tweet, mediaURL = line.strip().split('\t')
            else:
                sys.exit("set type param from {split,kb}")

            tweet = tknzr.tokenize(str(tweet))
            if len(tweet) < 6:
                continue
            tweet = preprocess_tweet(' '.join(tweet))
            outf.write(str(tweet) + '\n')

    outf.close()
    infn.close()
Example #26
def pre_process():
    data = []
    emotions = []
    word_dict = {}
    sentence = []

    with open('../data/text_emotion.csv') as csvDataFile:
        csv_reader = csv.reader(csvDataFile)
        for row in csv_reader:
            emotions.append(row[1])
            data.append(row[3])

    tknzr = TweetTokenizer()
    for d in data:
        tokens = tknzr.tokenize(d)
        sentence.append(tokens)

        # print(tokens)

    for s in sentence:
        for i in s:
            if i.lower() in word_dict:
                word_dict[i.lower()] += 1
            else:
                word_dict[i.lower()] = 1

    return [word_dict, sentence, emotions]
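A short usage sketch for the function above, assuming the ../data/text_emotion.csv file it reads is available with the expected column layout:

word_dict, sentences, emotions = pre_process()
print(len(sentences), "tweets,", len(word_dict), "distinct lowercased tokens")
print(sorted(word_dict.items(), key=lambda kv: kv[1], reverse=True)[:10])  # most frequent tokens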
Example #27
def getTopics(tweets, count=10):
    stop_words = set(stopwords.words("english"))
    stop_words.update([
        "rt", "anybody", "anyone", "anything", "everybody", "everyone",
        "everything", "nobody", "noone", "nothing", "somebody", "someone",
        "something", "thing", "things"
    ])

    tknzr = TweetTokenizer()

    trimmed_tweets = [[
        word for (word, pos) in pos_tag(tknzr.tokenize(tweet)) if len(word) > 1
        and word.casefold() not in stop_words and pos[0] == 'N'
    ] for tweet in tweets]

    t = trimmed_tweets
    t[:] = [[
        word.lower() if not match(r"\b[A-Z]{2,}\b", word) else word
        for word in wordlist
    ] for wordlist in trimmed_tweets]

    trimmed_tweets_counts = [Counter(wordlist) for wordlist in t]

    topics = Counter()
    for c in trimmed_tweets_counts:
        topics.update(c)

    # Counter dict `topics` can be very important. We can put preferences on twitter handles
    # they are complete nouns as opposed to parts of broken-down noun phrases like "graphic"
    # and "novel" which individually do not give the idea of the original phrase.
    # A large number of handles might mean they are connected to their followers better, interactive, etc.

    return topics.most_common(count)
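A quick usage sketch for getTopics, assuming NLTK's stop-word and POS-tagger data are installed; the tweets are invented and the tagger's choices determine the exact output:

tweets = [
    "RT @nasa: The James Webb telescope spotted a new galaxy!",
    "Nothing beats a quiet telescope night",
]
print(getTopics(tweets, count=5))
# -> a list of (noun, count) pairs, e.g. [('telescope', 2), ('galaxy', 1), ...]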
Example #28
def tokenize_with(kwargs):
    tokenizer = TweetTokenizer(**kwargs)

    def tweet_tokenizer(data):
        return [' '.join(tokenizer.tokenize(tweet)) for tweet in data]

    return tweet_tokenizer
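A usage sketch for the factory above; the keyword arguments are just TweetTokenizer options and the sample tweets are made up:

normalize = tokenize_with(dict(preserve_case=False, strip_handles=True, reduce_len=True))
print(normalize(["@bob that was AWESOMEEEE!!!", "See you tomorrow :-)"]))
# -> e.g. ['that was awesomeee ! ! !', 'see you tomorrow :-)']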
Example #29
def main():
    input_fname = 'small'
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    tknzr = TweetTokenizer()
    tagger = PerceptronTagger()

    fout = (
        'embeddings/smiley_tweets_embedding_expanded_{}'.format(input_fname))
    fname, delimiter, ndim = (
        'embeddings/smiley_tweets_embedding_{}'.format(input_fname), ' ', 52)
    word2vec = load_glove_vec(fname, {}, delimiter, ndim)

    tagdict = tagger.tagdict
    tagidx = {}
    nRows = len(word2vec)
    nCols = len(tagdict)

    print nRows, ':', nCols

    counter = 0
    for tag in tagdict.keys():
        tagidx[tag] = counter
        counter += 1

    exp_wemb = {}
    for word in word2vec.keys():
        exp_wemb[word] = np.zeros(nCols)

    print tagidx

    train = "semeval/task-B-train-plus-dev.tsv.gz"
    test = "semeval/task-B-test2014-twitter.tsv.gz"
    dev = "semeval/twitter-test-gold-B.downloaded.tsv.gz"
    test15 = "semeval/task-B-test2015-twitter.tsv.gz"
    smiley_pos = 'semeval/smiley_tweets_{}.gz'.format(input_fname)

    it = 0
    files = [train, test, dev, test15, smiley_pos]
    for filen in files:
        for tweet in gzip.open(filen, 'rb'):
            tweet = tknzr.tokenize(tweet.decode('utf-8'))
            tags = _pos_tag(tweet, None, tagger)
            for (word, tag) in tags:
                if word in exp_wemb.keys() and tag in tagidx.keys():
                    idx = tagidx[tag]
                    exp_wemb[word][idx] = 1
            if (it % 10) == 0:
                print 'Progress:', it
            it += 1

    f = open(fout, 'wb')
    for word in exp_wemb:
        f.write(word)
        tags = exp_wemb[word]
        for i in np.nditer(tags):
            f.write(' {}'.format(i))
        f.write("\n")
Example #30
def tokenize_tweets(filename, dest_folder):
    basename = os.path.basename(filename)
    dest = os.path.join(dest_folder, basename + '.tok')
    print("processing %s" % basename)
    tknzr = TweetTokenizer()
    with codecs.open(dest, 'w', "utf-8") as out_fs:
        with open(filename, 'r', encoding="utf-8") as in_fs:
            for line in in_fs: