Example #1
def stem(words):
    """Stem every token in the input list with NLTK's Porter stemmer."""
    ps = nltk.PorterStemmer()
    stemmed = []
    for word in words:
        stemmed.append(ps.stem(word))
    return stemmed
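# Hypothetical usage sketch (not from the original source); assumes `import nltk`
# at module level, and the token list is illustrative.
example_tokens = ['running', 'flies', 'easily']
print(stem(example_tokens))  # e.g. ['run', 'fli', 'easili']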
Example #2
    lamda = 0.6
    elmo_layers_weight = [0.0, 1.0, 0.0]
elif (database == "Duc2001"):
    data, labels = fileIO.get_duc2001_data()
    lamda = 1.0
    elmo_layers_weight = [1.0, 0.0, 0.0]
else:
    data, labels = fileIO.get_semeval2017_data()
    lamda = 0.6
    elmo_layers_weight = [1.0, 0.0, 0.0]

#download from https://allennlp.org/elmo
options_file = "../auxiliary_data/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "../auxiliary_data/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

porter = nltk.PorterStemmer()  #please download nltk
ELMO = word_emb_elmo.WordEmbeddings(options_file, weight_file, cuda_device=0)
SIF = sent_emb_sif.SentEmbeddings(ELMO, lamda=lamda, database=database)
en_model = StanfordCoreNLP(
    r'E:\Python_Files\stanford-corenlp-full-2018-02-27',
    quiet=True)  #download from https://stanfordnlp.github.io/CoreNLP/

try:
    for key, data in data.items():

        gold_labels = labels[key]
        gold_labels_stemmed = []

        for label in gold_labels:
            tokens = label.split()
            gold_labels_stemmed.append(' '.join(porter.stem(t) for t in tokens))
Example #3
def stemmer(text):
    ps = nltk.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text
Example #4
import PyPDF2
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk.corpus import stopwords

stop_words = set(stopwords.words(
    'english'))  # Getting the stop word list for english corpus
stem = nltk.PorterStemmer()  # Initialising the stemmer


# Read the pdf file (PyPDF2 >= 3.0 API; older releases used PdfFileReader/getPage/extractText)
def read_pdf(filename):
    content = ""
    with open(filename, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            content += page.extract_text().strip()
    return content


# Do preprocessing on the text blob by splitting it, tokenizing it and doing part-of-speech tagging
def process_data(pdf_content):
    sent_tok = pdf_content.split("\n")  # split different sentences
    word_tok = [nltk.word_tokenize(w)
                for w in sent_tok]  #split each sentence to its tokens
Example #5
def stem(desc, stemmer=None):
    stemmer = stemmer or nltk.PorterStemmer()
    return ' '.join(stemmer.stem(w) for w in nltk.word_tokenize(desc))
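# Hypothetical usage sketch (not from the original source): reusing a single
# stemmer instance avoids constructing a new PorterStemmer on every call.
# Assumes `import nltk` and that the 'punkt' tokenizer data are available.
shared_stemmer = nltk.PorterStemmer()
print(stem("The cats are running quickly", stemmer=shared_stemmer))
# roughly "the cat are run quickli" (recent NLTK releases lowercase inside stem())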
Example #6
def nameLDArepresentation(textfile, dictfile, representationfile):
    appname = []
    with open(textfile, 'r') as fin:
        for line in fin:
            line = json.loads(line)
            appname.append(line['name'])
    print("Preprocessing Name Text done!")

    tokenizer = RegexpTokenizer(r'\w+')
    appnametext = []
    count = 0
    for namestr in appname:
        wordlist = tokenizer.tokenize(namestr)
        wordlist_rmstopword = [
            word for word in wordlist if word not in stopwords.words('english')
        ]
        for i in range(len(wordlist_rmstopword)):
            wordlist_rmstopword[i] = nltk.PorterStemmer().stem(
                wordlist_rmstopword[i])
        Lnumber = [
            word for word in wordlist_rmstopword if re.match(r'\d+$', word)
        ]
        Lothers = [
            word for word in wordlist_rmstopword if re.match(r'^_+', word)
        ]
        wordlist_rmLnumber = [
            word for word in wordlist_rmstopword if word not in Lnumber
        ]
        wordlist_rmLothers = [
            word for word in wordlist_rmLnumber if word not in Lothers
        ]
        appnametext.append(wordlist_rmLothers)
        if count % 1000 == 0:
            print(count)
        count = count + 1

    # appname.clear()
    print("Obtain Name Wordlist done!")

    name_dict = corpora.Dictionary(appnametext)
    once_ids = [
        wordid for wordid, docfreq in name_dict.dfs.items() if docfreq == 1
    ]
    name_dict.filter_tokens(once_ids)
    name_dict.save_as_text(dictfile)
    print("Obtain Name dictionary done!")

    corpus_tf = [name_dict.doc2bow(eachappname) for eachappname in appnametext]

    KList = [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
    for k in KList:
        #lda transform
        lda = models.LdaModel(corpus=corpus_tf,
                              id2word=name_dict,
                              num_topics=k,
                              minimum_probability=0)
        # print 'lda:',type(lda)
        # lda.save()
        corpus_lda = lda[corpus_tf]
        print("Obtain Name lsi representation done!")
        featurefile = representationfile + str(k) + ".txt"
        fout = open(featurefile, 'w')
        for doc in corpus_lda:
            line = []
            for i in range(k):
                line.append(0)
            for (fid, fvalue) in doc:
                line[fid] = fvalue
            for item in line:
                t = fout.write(str(item) + '\t')
            t = fout.write('\r\n')
        fout.close()
Example #7
def str_stemmer(s):
    return " ".join([nltk.PorterStemmer().stem_word(word) for word in s.lower().split()])
sent = sents[4]
print(sent)
# Segment the words in sentence with a "tokenizer"
tokens = nltk.word_tokenize(sent)
tokens

# Normalize the tokens
normalized_tokens = [t.lower() for t in tokens]
print('\nNormalized tokens:\n', normalized_tokens)

# Build the vocabulary
vocabulary = sorted(set(normalized_tokens))
print('\nThe vocabulary:\n', vocabulary)

#Stemming
print([nltk.PorterStemmer().stem(t) for t in tokens])

#For example we can show that the stemmer works:
example_tokens = ['lie', 'lied', 'lay', 'lies', 'lying']
stemmed_tokens = [nltk.PorterStemmer().stem(t) for t in example_tokens]
print(stemmed_tokens)
#Let's look at some statistics for the words
ltokens = [nltk.word_tokenize(doc) for doc in news.data[:500]]
# convert list of list of tokens (ltokens) into a list of tokens
import itertools
tokens_all = list(itertools.chain.from_iterable(ltokens))
# convert list of tokens to nltk text object
x = nltk.Text(t.lower() for t in tokens_all)

print("The text comprises %d normalized tokens." % len(x))
print("The first few are", x[:10])
Example #9
 def __init__(self, tokens):
     """ Constructor. """
     self.stemmer = nltk.PorterStemmer()
     self.tokens = tokens
Example #10
def nameDoc2vec_representation(textfile, dictfile, representationfile):
    appname = []
    with open(textfile, 'r') as fin:
        for line in fin:
            line = json.loads(line)
            appname.append(line['name'])
    print("Preprocessing Name Text done!")

    tokenizer = RegexpTokenizer(r'\w+')
    appnametext = []
    count = 0
    for namestr in appname:
        wordlist = tokenizer.tokenize(namestr)
        wordlist_rmstopword = [
            word for word in wordlist if word not in stopwords.words('english')
        ]
        # wordlist_rmstopword=wordlist
        for i in range(len(wordlist_rmstopword)):
            wordlist_rmstopword[i] = nltk.PorterStemmer().stem(
                wordlist_rmstopword[i])
        Lnumber = [
            word for word in wordlist_rmstopword if re.match(r'\d+$', word)
        ]
        Lothers = [
            word for word in wordlist_rmstopword if re.match(r'^_+', word)
        ]
        wordlist_rmLnumber = [
            word for word in wordlist_rmstopword if word not in Lnumber
        ]
        wordlist_rmLothers = [
            word for word in wordlist_rmLnumber if word not in Lothers
        ]
        appnametext.append(wordlist_rmLothers)
        if count % 1000 == 0:
            # print(str(count)+"...",end='')
            print(count)
        count = count + 1

    # appname.clear()
    print("Obtain Name Wordlist done!")

    name_dict = corpora.Dictionary(appnametext)
    once_ids = [
        wordid for wordid, docfreq in name_dict.dfs.items() if docfreq == 1
    ]
    name_dict.filter_tokens(once_ids)
    name_dict.save_as_text(dictfile)
    print("Obtain Name dictionary done!")

    # corpus_tf=[name_dict.doc2bow(eachappname) for eachappname in appnametext]
    # #tfidf transform
    # tfidf=models.TfidfModel(corpus_tf)
    # corpus_tfidf=tfidf[corpus_tf]

    KList = [
        50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000
    ]

    # documents = []
    # for i in range(len(appnametext)):
    #     string = "SENT_" + str(i)
    #     sentence = models.doc2vec.LabeledSentence(appnametext[i], labels = [string])
    #     documents.append(sentence)
    file = 'doc2vec/apptext.txt'
    apptext = open(file, 'w')
    for line in appnametext:
        # print line
        apptext.write(' '.join(line) + '\n')
    apptext.close()
    for k in KList:
        #lsi transform
        # lsi=models.LsiModel(corpus=corpus_tfidf,id2word=name_dict,num_topics=k)
        documents = models.doc2vec.TaggedLineDocument(file)
        doc2vec = models.Doc2Vec(documents,
                                 vector_size=k,  # named `size` in gensim < 4.0
                                 window=2,
                                 min_count=0,
                                 workers=4)
        featurefile = representationfile + str(k) + ".txt"
        fout = open(featurefile, 'w')
        for i in range(len(appnametext)):  # one document vector per app name
            # print type()
            valueList = doc2vec.docvecs[i].tolist()
            for j in range(k):

                fout.write(str(valueList[j]) + '\t')
            fout.write('\n')
        fout.close()
Example #11
 def __init__(self):
     self.model = self._load_model()
     self.lemmatizer = nltk.WordNetLemmatizer()
     self.stemmer = nltk.PorterStemmer()
     self.country_fixes = {}
     self.countries = self._read_countries()
Example #12
def stem_with_porter(words):
    porter = nltk.PorterStemmer()
    new_words = [porter.stem(w) for w in words]
    return new_words
Example #13
    try:
        user_desc = json_dict['user']['description'].replace(' ', '')
        if user_desc:
            text += ' &' + user_desc
    except:
        pass    

    return text

if __name__ == "__main__":
    data_dir = './data'  ## Set your own file path here.

    x_filename = 'samples.txt'
    y_filename = 'labels.txt'

    porter = nltk.PorterStemmer()    #porter stemmer
    stops = set(stopwords.words('english'))
    stops.add('rt') #may add personalized stop words


    ##load and process samples
    print('start loading and process samples...')
    words_stat = {} # record statistics of the df and tf for each word; Form: {word:[tf, df, tweet index]}
    tweets = []
    bonuses = [] 
    cnt = 0
    with open(os.path.join(data_dir, x_filename)) as f:
        for i, line in enumerate(f):
            postprocess_tweet = []
            tweet_obj = json.loads(line.strip())  # the `encoding` kwarg was removed from json.loads in Python 3.9
            content = tweet_obj['text'].replace("\n"," ")
Example #14
def setup():
    nltk.download('punkt')
    nltk.download('wordnet')
    porter = nltk.PorterStemmer()
    wnl = nltk.WordNetLemmatizer()
    return [porter, wnl]
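# Hypothetical usage sketch (not from the original source), contrasting the two
# normalizers that setup() returns:
porter, wnl = setup()
print(porter.stem('geese'))    # e.g. 'gees'  -- rule-based suffix stripping
print(wnl.lemmatize('geese'))  # e.g. 'goose' -- dictionary-based lemmatization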
Example #15
 def __init__(self):
     nltk.download('stopwords')
     nltk.download('punkt')
     self.stemmer = nltk.PorterStemmer()
     self.stopwords = set(
         stopwords.words('english'))  # + get_stop_words('en'))
Example #16
    nltk, sklearn
'''

import tomotopy as tp
import nltk
from nltk.corpus import stopwords
import re
from sklearn.datasets import fetch_20newsgroups
import itertools

print('Training lda models...')
try:
    # load the trained model if it already exists
    mdl = tp.LDAModel.load('trained_lda_model.bin')
except:
    porter_stemmer = nltk.PorterStemmer().stem
    english_stops = set(porter_stemmer(w) for w in stopwords.words('english'))
    pat = re.compile('^[a-z]{2,}$')
    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
        stopwords=lambda x: x in english_stops or not pat.match(x))
    newsgroups_train = fetch_20newsgroups()
    corpus.process(d.lower() for d in newsgroups_train.data)

    mdl = tp.LDAModel(min_df=5, rm_top=30, k=20, corpus=corpus)
    mdl.train(0)

    print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
        len(mdl.docs), len(mdl.used_vocabs), mdl.num_words))
    print('Removed Top words: ', *mdl.removed_top_words)
Example #17
def Tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    porter_stemmer = nltk.PorterStemmer()
    words = [porter_stemmer.stem(word) for word in words]
    return words
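# Hypothetical usage sketch (not from the original source): a callable like this
# is commonly passed to a scikit-learn vectorizer as its custom tokenizer.
from sklearn.feature_extraction.text import TfidfVectorizer

example_docs = ["Cats are running", "A cat ran quickly"]
vectorizer = TfidfVectorizer(tokenizer=Tokenizer, lowercase=False)  # Tokenizer already lowercases
matrix = vectorizer.fit_transform(example_docs)
print(vectorizer.get_feature_names_out())  # stemmed vocabulary, e.g. ['a', 'are', 'cat', 'quickli', 'ran', 'run']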
Example #18
def stemming(tokens):
    """
    stem tokens
    """
    porter = nltk.PorterStemmer()
    return [porter.stem(t) for t in tokens]
Example #19
def descriptionLDArepresentation(textfile, dictfile, representationfile):
    appdescription = []
    htmlcompilers = re.compile(
        r'<[^>]+>| +|=+|\?+|!+|-+|\*+|\.+|(&gt;)+|\(+|\)+|\^+|\_+|#+|\[+|\,+|(&amp;)+|\/+|\]+|:+|(&39;t)+|(&quot;)+|(&#39;)+|~+',
        re.S)
    spacecompilers = re.compile(r'\s+', re.S)
    with open(textfile, 'r') as fin:
        for line in fin:
            line = json.loads(line)
            tmp = htmlcompilers.sub(' ', line['description'])
            tmp = spacecompilers.sub(' ', tmp)
            appdescription.append(tmp)
    print("Preprocessing description Text done!")

    tokenizer = RegexpTokenizer(r'\w+')
    appdescriptiontext = []
    count = 0
    for descriptionstr in appdescription:
        wordlist = tokenizer.tokenize(descriptionstr)
        wordlist_rmstopword = [
            word for word in wordlist if word not in stopwords.words('english')
        ]
        for i in range(len(wordlist_rmstopword)):
            wordlist_rmstopword[i] = nltk.PorterStemmer().stem(
                wordlist_rmstopword[i])
        Lnumber = [
            word for word in wordlist_rmstopword if re.match(r'\d+$', word)
        ]
        Lothers = [
            word for word in wordlist_rmstopword if re.match(r'^_+', word)
        ]
        wordlist_rmLnumber = [
            word for word in wordlist_rmstopword if word not in Lnumber
        ]
        wordlist_rmLothers = [
            word for word in wordlist_rmLnumber if word not in Lothers
        ]
        appdescriptiontext.append(wordlist_rmLothers)
        if count % 1000 == 0:
            print(count)
        count = count + 1

    # appdescription.clear()
    print("Obtain Description Wordlist done!")

    description_dict = corpora.Dictionary(appdescriptiontext)
    once_ids = [
        wordid for wordid, docfreq in description_dict.dfs.items()
        if docfreq == 1
    ]
    description_dict.filter_tokens(once_ids)
    description_dict.save_as_text(dictfile)
    print("Obtain Description dictionary done!")

    corpus_tf = [
        description_dict.doc2bow(eachappdescription)
        for eachappdescription in appdescriptiontext
    ]

    KList = [600, 700, 800, 900, 1000]
    for k in KList:
        #lda transform
        lda = models.LdaModel(corpus=corpus_tf,
                              id2word=description_dict,
                              num_topics=k,
                              minimum_probability=0)
        corpus_lda = lda[corpus_tf]
        print("Obtain Description lsi representation done!")
        featurefile = representationfile + str(k) + ".txt"
        fout = open(featurefile, 'w')
        for doc in corpus_lda:
            line = []
            for i in range(k):
                line.append(0)
            for (fid, fvalue) in doc:
                line[fid] = fvalue
            for item in line:
                t = fout.write(str(item) + '\t')
            t = fout.write('\r\n')
        fout.close()
Example #20
def norm(arr):
    return np.sqrt(np.sum((arr)**2,axis=0))

def radian(arr1,arr2):
    return np.arccos(np.sum(arr1*arr2,axis=0)/(norm(arr1)*norm(arr2)))

def distance(arr1,arr2):
    return np.sqrt(np.sum((np.abs(arr1-arr2))**2,axis=0))

corpus=pd.read_csv("pacifier.csv",usecols=["review_body"])[:10]["review_body"].values
tokenizer = nltk.RegexpTokenizer(r'\w+')  # regex tokenizer that strips punctuation
corpus2=["" for i in range(0,corpus.shape[0])]
for i in range(0,corpus.shape[0]):
    lis=tokenizer.tokenize(corpus[i])
    for word in lis:
        corpus2[i] += nltk.PorterStemmer().stem(word) + " "
        # keep only the stem of each word in the text
corpus=np.array(corpus2)
tfidf_vectorizer = TfidfVectorizer() 
tfidf = tfidf_vectorizer.fit_transform(corpus).toarray()
feature_name = tfidf_vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
sm_dis=np.zeros((tfidf.shape[0],tfidf.shape[0]))
sm_rad=np.zeros((tfidf.shape[0],tfidf.shape[0]))
for j in range(0,tfidf.shape[0]):
    for i in range(j,tfidf.shape[0]):
        sm_dis[j][i]=distance(tfidf[j],tfidf[i])
        sm_dis[i][j]=sm_dis[j][i]
        sm_rad[j][i]=radian(tfidf[j],tfidf[i])
        sm_rad[i][j]=sm_rad[j][i]

# %%
Example #21
def text_to_wordlist(text, remove_stop_words=True, stem_words=False):
    # Clean the text, with the option to remove stop_words and to stem words.

    # Clean the text
    text = text.rstrip('?')
    text = text.rstrip(',')
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"What's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"switzerland", "Switzerland", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text) 
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r" dms ", "direct messages ", text)  
    text = re.sub(r"demonitization", "demonetization", text) 
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text) 
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text) 
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"gps", "GPS", text)
    text = re.sub(r"gst", "GST", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"dna", "DNA", text)
    text = re.sub(r"III", "3", text) 
    text = re.sub(r"the US", "America", text)
    text = re.sub(r"Astrology", "astrology", text)
    text = re.sub(r"Method", "method", text)
    text = re.sub(r"Find", "find", text) 
    text = re.sub(r"banglore", "Banglore", text)
    text = re.sub(r" J K ", " JK ", text)
    
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])

    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        #stemmer = SnowballStemmer('english')
        #stemmed_words = [stemmer.stem(word) for word in text]
        stemmed_words = [nltk.PorterStemmer().stem(word.lower()) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)
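# Hypothetical usage sketch (not from the original source). text_to_wordlist
# reads the module-level `stop_words` set and `punctuation`, so both must be
# defined (as in the original module, which also imports re and nltk); the
# question string is illustrative.
from string import punctuation
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

question = "What's the best way to learn programing quikly?"
print(text_to_wordlist(question))                   # cleaned, stop words removed
print(text_to_wordlist(question, stem_words=True))  # additionally Porter-stemmed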
Example #22
def main():
    data_dir = './data'
    tweet_source_file = 'samples.txt'

    porter = nltk.PorterStemmer()
    stops = set(stopwords.words('english'))

    ## Load and process sample tweets
    print('start loading and process samples...')
    hashtags_stat = {
    }  # record statistics of the df and tf for each hashtag; Form: {tag:[tf, df, tweet index]}
    hashtags = []
    with open(os.path.join(data_dir, tweet_source_file)) as f:
        for i, line in enumerate(f):
            postprocess_hashtag_list = []
            tweet_obj = json.loads(line.strip())  # the `encoding` kwarg was removed from json.loads in Python 3.9
            hashtag_list = tweet_obj['entities']['hashtags']
            no_of_hashtags = len(hashtag_list)
            hashtag_text_list = []
            if no_of_hashtags == 0:
                # joined_postprocess_tags = ''
                joined_postprocess_tags = 'void'
                # hashtags.append(joined_postprocess_tags)
            else:
                for j in range(no_of_hashtags):
                    hashtag_text_list.append(hashtag_list[j]['text'])
                joined_tags = ' '.join(hashtag_text_list)
                tags = pre_process(joined_tags, porter)
                for tag in tags:
                    if tag not in stops:
                        postprocess_hashtag_list.append(tag)
                        if tag in hashtags_stat.keys():
                            hashtags_stat[tag][0] += 1
                            if i != hashtags_stat[tag][2]:
                                hashtags_stat[tag][1] += 1
                                hashtags_stat[tag][2] = i
                        else:
                            hashtags_stat[tag] = [1, 1, i]
                joined_postprocess_tags = ' '.join(postprocess_hashtag_list)
            hashtags.append(joined_postprocess_tags)
        # print(hashtags[:50])

    ## Save the statistics of tf and df for each hashtag into file
    print("The number of unique words in data set is %i." %
          len(hashtags_stat.keys()))
    lowTF_tags = set()
    stats_dir = './stats'
    with open(os.path.join(stats_dir, 'hashtags_statistics.txt'), 'w') as f:
        f.write('TF\tDF\tHASHTAG\n')
        for tag, stat in sorted(hashtags_stat.items(),
                                key=lambda i: i[1],
                                reverse=True):
            f.write('\t'.join([str(m) for m in stat[0:2]]) + '\t' + tag + '\n')
            if stat[0] < 2:
                lowTF_tags.add(tag)
    print("The number of low frequency words is %d." % len(lowTF_tags))

    ## Re-process samples, filter low frequency hashtags...
    features_dir = './features'
    fout = open(os.path.join(features_dir, 'hashtags_processed.txt'), 'w')
    new_hashtags_list = []
    for hashtag in hashtags:
        tags = hashtag.split(' ')
        new = []
        for tag in tags:
            if tag not in lowTF_tags:
                new.append(tag)
        if len(new) == 0:
            new.append('void')
        new_hashtags = ' '.join(new)
        new_hashtags_list.append(new_hashtags)
        fout.write('%s\n' % new_hashtags)
    fout.close()

    print("Preprocessing is completed")
def stem(tokens):
    porter = nltk.PorterStemmer()
    return [porter.stem(x) for x in tokens]
Example #24
def stem(msg: str) -> str:
    stemmer = nltk.PorterStemmer()
    return ' '.join(stemmer.stem(term) for term in msg.split())
Example #25
def jerry_learn():
    key_file = 'keys.json'
    with open(key_file) as f:
        keys = json.load(f)
    auth = tweepy.OAuthHandler(keys["consumer_key"], keys["consumer_secret"])
    auth.set_access_token(keys["access_token"], keys["access_token_secret"])
    api = tweepy.API(auth, wait_on_rate_limit=True)
    today = date.today()
    today = datetime(today.year, today.month, today.day)
    week_ago = today - timedelta(days=1)
    start = week_ago.strftime('%Y-%m-%d %H:%M:%S')[0:10]
    timestamp = []
    user = []
    text = []
    retweet_count = []
    i = 0
    for tweet in tweepy.Cursor(api.search, q = '#bitcoin', lang="en", since = start).items():
        i += 1
        timestamp.append(tweet.created_at)
        retweet_count.append(tweet.retweet_count)
        text.append(tweet.text)
        user.append(tweet.user.screen_name)
        if i > 1500:
            break
    start2 = int(round(timestamp[-1].replace(tzinfo=timezone.utc).timestamp()))
    rawlink = "http://api.bitcoincharts.com/v1/trades.csv?symbol=bitstampUSD"
    link = rawlink + "&start=" + str(int(round(start2)))
    filename = wget.download(link)
    btcprice = pd.read_csv(filename, header = None)
    btcprice.columns = ['unixtime', 'price', 'amount']
    converted_time = btcprice['unixtime'].apply(lambda x: datetime.utcfromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))
    d = {'timestamp': timestamp, 'user': user, 'text' : text, 'retweet' : retweet_count}
    df = pd.DataFrame(data = d)
    df.to_csv("most_recent_tweet.csv")
    btcprice['timestamp'] = converted_time
    btcprice2 = btcprice.iloc[::50, :].reset_index()
    del btcprice2['index']
    df2 = df.iloc[::-1].reset_index()
    del df2['index']
    btcprice2['timestamp'] = btcprice2['timestamp'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    def cal_direction(array):
        direction = np.ones(len(array))
        for i in range(len(array) - 1):
            if array[i + 1] - array[i] < 0:
                direction[i + 1] = 0
        return(direction)
    btcprice2 = btcprice2.assign(direction = cal_direction(btcprice2['price'].values))
    direction_tweet = np.zeros(len(df2))
    for x in range(len(df2)):
        for y in range(len(btcprice2)):
            if (btcprice2.loc[y, 'timestamp'] > df2.loc[x, 'timestamp']):
                direction_tweet[x] = btcprice2.loc[y, 'direction']
                break
    stopwords = nltk.corpus.stopwords.words('english')
    ps = nltk.PorterStemmer()
    def clean_text(text):
        text = "".join([word.lower() for word in text if word not in string.punctuation])
        tokens = re.split(r'\W+', text)
        text = [ps.stem(word) for word in tokens if word not in stopwords]
        return text
    tfidf_vec = TfidfVectorizer(analyzer=clean_text)
    x_tfidf = tfidf_vec.fit_transform(df2['text'])
    x_tfidf.columns = tfidf_vec.get_feature_names_out()
    x_counts_tfidf = pd.DataFrame(x_tfidf.toarray())
    x_feature = pd.concat([df2[['retweet']], x_counts_tfidf], axis = 1)
    x_feature2 = x_feature.loc[:int(round(0.8*len(x_feature)))-1, :]
    direction_tweet2 = direction_tweet[:int(round(0.8*len(direction_tweet)))]
    x_est = x_feature.loc[int(round(0.8*len(x_feature))):, :]
    train_size = int(round(0.8*len(x_feature2)))
    x_train = x_feature2.loc[:train_size-1, :]
    x_test = x_feature2.loc[train_size:, :]
    y_train = direction_tweet2[:train_size]
    y_test = direction_tweet2[train_size:]
    rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
    rf_model = rf.fit(x_train, y_train)
    y_pred = rf_model.predict(x_test)
    label = None
    if sum(y_pred == 0) >= sum(y_pred == 1):
        label = 0
    else:
        label = 1
    precision, recall, fscore, support = score(y_test, y_pred, pos_label= label, average='binary')
    val1 = 'Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                            round(recall, 3),
                                                            round((y_pred==y_test).sum() / len(y_pred),3))
    y_est = rf_model.predict(x_est)
    p1 = sum(y_est == 1)
    p0 = sum(y_est == 0)
    val2 = None
    if p1 > p0:
        val2 = "The random forest model detects an upward trend based on conversations on tweet with a probability of " + str(p1/len(y_est))
    else:
        val2 = "The random forest model detects an downward trend based on conversations on tweet with a probability of " + str(p0/len(y_est))
    return val1, val2
Example #26
def getBestWords(trainSet):
    # extract features for each review and store in list of tuples pertaining to each review
    # this is the training data to be passed to the classifier
    word_freq = nltk.probability.FreqDist()
    label_freq = nltk.probability.ConditionalFreqDist()
    stemmer = nltk.PorterStemmer()

    print("Getting word frequency..")
    i = 0
    for review in trainSet:
        if (review[2] == 'pos'):
            words = [stemmer.stem(x.lower()) for x in review[3]]

            word_freq.update(nltk.probability.FreqDist(words))
            word_freq.update(
                nltk.probability.FreqDist([x.lower() for x in review[3]]))
            label_freq['pos'].update(
                nltk.probability.FreqDist([x.lower() for x in review[3]]))
            label_freq['pos'].update(nltk.probability.FreqDist(words))
        elif (review[2] == 'neg'):
            words = [stemmer.stem(x.lower()) for x in review[3]]

            word_freq.update(nltk.probability.FreqDist(words))
            word_freq.update(
                nltk.probability.FreqDist([x.lower() for x in review[3]]))
            label_freq['neg'].update(
                nltk.probability.FreqDist([x.lower() for x in review[3]]))
            label_freq['neg'].update(nltk.probability.FreqDist(words))

        if (i % 20 == 0):
            print(".", end="")
        if (i % 1000 == 0):
            print(str(i))
        i = i + 1

    print(str(i) + " Finished")
    pos_words = label_freq['pos'].N()
    neg_words = label_freq['neg'].N()
    total_words = pos_words + neg_words
    word_scores = {}

    print("Calculating word scores..")
    for word, freq in word_freq.items():
        pos_score = nltk.BigramAssocMeasures.chi_sq(label_freq['pos'][word],
                                                    (freq, pos_words),
                                                    total_words)
        neg_score = nltk.BigramAssocMeasures.chi_sq(label_freq['neg'][word],
                                                    (freq, neg_words),
                                                    total_words)
        tag = nltk.pos_tag([word])[0][1]
        if ('VB' in tag or 'NN' in tag
                or 'RB' in tag or 'JJ' in tag):
            word_scores[word] = pos_score + neg_score

    print("Sorting Word scores..")
    best = sorted(word_scores.items(), key=lambda item: item[1],
                  reverse=True)[:5000]
    print("Getting Best words..")
    bestwords = set([w for w, s in best])

    return bestwords
Example #27
# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ', regex=True)

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '', regex=True)

# change all to lower case
processed = processed.str.lower()

# remove stop words
stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

# Remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

# create bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

# print the total number of words and the 15 most common words
# print('Number of words: {}'.format(len(all_words)))
Example #28
    def __init__(self,
                 path,
                 StopwordRemoval=False,
                 Stemming=False,
                 Debug=False):
        #run in corpus folder
        for root, directories, documents in os.walk(path):
            # filter
            documents = [f for f in documents if f.endswith('.html')]
            # use a small part of the corpus
            if Debug:
                documents = [
                    f for f in documents if int(f.split('.')[0]) < 1000
                ]

        # build index
        self._documents = documents
        if Debug:
            print('stopword removal:{0}, stemming:{1}'.format(
                StopwordRemoval, Stemming))
        if StopwordRemoval:
            self._stop = nltk.corpus.stopwords.words('english')
        if Stemming:
            self._stemmer = nltk.PorterStemmer()

        #index is a dict of dict
        #self._index = defaultdict(lambda : defaultdict(int))
        self._index = defaultdict(dd)

        self.N = 0
        self.docIDs = []
        for document in documents:
            if Debug:
                print('processing document {0}...'.format(document))
            try:
                '''example of document : 21393.html
				documentID : 21393
				split document name by dot and parse first part as int'''
                documentID = int(document.split('.')[0])
                print("parsing Doc {0}".format(self.N))
                self.docIDs.append(documentID)
                #read content and tokenization
                content = open(path + '/' + document, errors='ignore')
                raw = content.read()
                raw = raw.lower()
                tokens = nltk.word_tokenize(raw)
                if StopwordRemoval:
                    tokens = [
                        token for token in tokens if token not in self._stop
                    ]

                def StemToken(token):
                    return self._stemmer.stem(token)

                if Stemming:
                    tokens = map(StemToken, tokens)
                    #tokens = map(lambda token:self._stemmer.stem(token), tokens)
                '''
				index[token][documentID] -- term frequency -- tf
				size of dict index[token] -- document frequency -- df
				'''
                for token in tokens:
                    self._index[token][documentID] += 1

                self.N = self.N + 1
            except Exception as e:
                print('error occur when reading {0}'.format(documentID))
                raise e
Example #29
        #print(READ_DATA)

        # eliminate punctuation
        for char in string.punctuation:
            READ_DATA = READ_DATA.replace(char, ' ')
        #print(READ_DATA)

        # eliminate numbers
        for char in string.digits:
            READ_DATA = READ_DATA.replace(char, ' ')
        #print(READ_DATA)

        # perform stemming using nltk stemmer
        tokens = nltk.word_tokenize(READ_DATA)

        porter = nltk.PorterStemmer()
        looper = 0
        for token in tokens:
            tokens[looper] = porter.stem(token)
            looper += 1
        #print "Stemmed -->"
        #print tokens

        for token in tokens:
            #print(token)

            #check if word exists in dictionary
            if token in list['words']:  # `has_key` was removed in Python 3; `list` here is a dict defined earlier
                #print(token + ' exists')

                #check if doc already in word's docList
Example #30
    def add_details(self, details=None, commit=False, **kwargs):
        """
        Adds arbitrary key-value pairs to this entry.

        Parameters
        ----------
        details : list
            .. versionadded:: 0.1.8
            List of dict of structure:
            .. code-block::
                [{
                    'key': '',
                    'value': '',
                    'description': ''
                }]
            where the ``description`` is optional and can be omitted.
            If no descriptions are passed at all, you can also use `**kwargs`
            to pass ``key=value`` pairs.
        commit : bool
            If True, the Entry will be added to the
            current session and the transaction is committed.
            Can have side effects. Defaults to False.

        """
        ps = nltk.PorterStemmer()

        # build entries here
        detail_list = []

        # parse kwargs
        for k, v in kwargs.items():
            detail_list.append({
                'entry_id': self.id,
                'key': str(k),
                'stem': ps.stem(k),
                'value': v
            })

        # parse details
        if details is not None:
            for detail in details:
                d = {
                    'entry_id': self.id,
                    'key': detail['key'],
                    'stem': ps.stem(detail['key']),
                    'value': detail['value']
                }
                if 'description' in detail.keys():
                    d['description'] = detail['description']
                detail_list.append(d)

        # build the models
        for detail in detail_list:
            self.details.append(models.Detail(**detail))

        if commit:
            session = object_session(self)
            try:
                session.add(self)
                session.commit()
            except Exception as e:
                session.rollback()
                raise e
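    # Hypothetical usage sketch (not from the original source): `some_entry`
    # stands in for an existing Entry instance; the keys and values are
    # illustrative.
    #
    #   some_entry.add_details(units='mm', location='station 12')   # **kwargs form
    #   some_entry.add_details(details=[{'key': 'sensor', 'value': 'TDR',
    #                                    'description': 'time domain reflectometry'}],
    #                          commit=True)                          # explicit dicts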