def perprocessing(tdic):
    new_dic = {}
    POS_feature = []
    for line in tdic:
        id = line
        gt = tdic[line][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[line][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        print(text_tk)
        telist = []
        for word in text_tk:
            word = word.lower()
            # ps = nltk.stem.PorterStemmer()
            # word = ps.stem(word)
            telist.append(word)
        # print(telist)
        # lemma() returns the lemmatized tokens and a POS-based feature for this tweet
        afterlemma = lemma(telist)
        telist = afterlemma[0]
        POS_feature.append(afterlemma[1])
        # print(telist)
        newtext = ' '.join(telist)
        # print(newtext)
        newtext = textPreprocessor01.replaceall(newtext)  # final pass: replace URLs, smileys, etc. with placeholder tokens such as URLINK and SADFACE
        print(newtext)
        new_dic[id] = gt, newtext
    return new_dic, np.array(POS_feature)
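Example #1 leans on a project helper, lemma(tokens), that is not part of this excerpt; from the call site it has to return the lemmatized tokens plus a POS-based feature for the tweet. Below is a minimal sketch of such a helper, assuming NLTK's pos_tag and WordNetLemmatizer (the author's real implementation may differ).

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# requires the 'averaged_perceptron_tagger' and 'wordnet' NLTK data packages
_lemmatizer = WordNetLemmatizer()

def _to_wordnet_pos(treebank_tag):
    # map Penn Treebank tags onto the coarse WordNet POS classes
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

def lemma(tokens):
    # hypothetical stand-in for the helper used above:
    # returns (lemmatized tokens, POS tags) for a single tweet
    tagged = nltk.pos_tag(tokens)
    lemmas = [_lemmatizer.lemmatize(w, _to_wordnet_pos(t)) for w, t in tagged]
    return lemmas, [t for _, t in tagged]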
Example #2
    def parse_text(tw_obj):
        # remove user mentions and URLs from the text
        # use the extended tweet if present
        if 'extended_tweet' in tw_obj:
            text = tw_obj['extended_tweet']['full_text']
        # or use normal text
        else:
            text = tw_obj['text']

        # process quoted tweet and append to text
        if tw_obj['is_quote_status'] and 'quoted_status' in tw_obj:
            # process quoted tweet
            qt_obj = tw_obj['quoted_status']
            if 'extended_tweet' in qt_obj:
                qt_text = qt_obj['extended_tweet']['full_text']
            # or use normal text
            else:
                qt_text = qt_obj['text']
            text = ''.join([text, ' %QUOTES% ', qt_text])

        text_norm = normalizeTextForTagger(replace_sp_tokens(text))
        # process text into list of keywords
        text_tokens = get_tokens(text)
        text_tokens = [t for t in text_tokens if t not in stopwords]
        token_counts = dict(Counter(text_tokens))  # frequency count for each remaining token
        # text_tokens = [lemma(t) for t in text_tokens]

        return text, text_norm, text_tokens, token_counts
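parse_text reads a tweet object in the shape produced by the Twitter streaming API. The sketch below, with made-up values, shows the only fields it actually touches; stopwords, replace_sp_tokens and get_tokens still have to be in scope before the commented call will run.

# hypothetical input illustrating the fields parse_text reads
tw_obj = {
    'text': 'Truncated text shown by default ...',
    'extended_tweet': {'full_text': 'Full 280-character text, preferred when present'},
    'is_quote_status': True,
    'quoted_status': {
        'text': 'Quoted tweet, short form',
        'extended_tweet': {'full_text': 'Quoted tweet, full form'},
    },
}
# text, text_norm, text_tokens, token_counts = parse_text(tw_obj)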
Example #3
def perprocessing(tdic):
    new_dic = {}
    for line in tdic:
        id = line
        gt = tdic[line][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[line][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        telist = []
        ps = nltk.stem.PorterStemmer()  # build the stemmer once, outside the token loop
        for word in text_tk:
            word = ps.stem(word.lower())
            # word = nltk.stem.SnowballStemmer(word)
            telist.append(word)
        newtext = ' '.join(telist)
        # print(newtext)
        newtext = textPreprocessor01.replaceall(newtext)
        new_dic[id] = gt, newtext
        # print(type(tdic[line][1]))
        # print(line)
        # print(type(line))
        # print(type(newtext))
        # print(newtext)
    return new_dic
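Several of these examples call textPreprocessor01.replaceall (or textPreprocessor02.replaceall in Example #7), which is not shown. Judging from the comment in Example #1, it maps URLs, mentions and emoticons onto placeholder tokens such as URLINK and SADFACE. A rough, hypothetical re-creation follows; the placeholder names other than URLINK and SADFACE are guesses.

import re

URL_RE = re.compile(r'https?://\S+|www\.\S+', re.I)
MENTION_RE = re.compile(r'@\w+')
SAD_RE = re.compile(r'[:;]-?[(\[]')
HAPPY_RE = re.compile(r'[:;]-?[)\]]')

def replaceall(text):
    # hypothetical version of textPreprocessor01.replaceall
    text = URL_RE.sub('URLINK', text)
    text = MENTION_RE.sub('USERMENTION', text)   # placeholder name is a guess
    text = SAD_RE.sub('SADFACE', text)
    text = HAPPY_RE.sub('HAPPYFACE', text)       # placeholder name is a guess
    return text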
Example #4
def file_list(filepath,include_columns):
    ifile  = open(filepath, "rb")
    reader = csv.reader(ifile)
    rownum = 0
    print("Reading Tweet")
    hashwords = re.compile(r"#\S*", re.I)
    linkwords = re.compile(r"http\S*", re.I)
    reference = re.compile(r"@\S*", re.I)
    listo = []
    for row in reader:
        if rownum == 0:
            header = row
        else:
            listi = []
            for col in include_columns:
                if col == 1:
                    time = datetime.datetime.strptime(row[col], "%a %b %d %H:%M:%S +0000 %Y")
                    listi.append(time)
                elif col == 6:
                    query = str(row[col])
                    for res in re.finditer(linkwords,query):
                        query = query.replace(res.group(),"")
                    for res in re.finditer(hashwords,query):
                        query = query.replace(res.group(),"")
                    for res in re.finditer(reference,query):
                        query = query.replace(res.group(),"")
                    query = tk.squeezeWhitespace(query)
                    query = tk.normalizeTextForTagger(query.decode('latin-1').encode("utf-8").decode('utf8'))
                    listi.append(query)
                else:
                    listi.append(row[col])
            listo.append(listi)
        rownum += 1
    ifile.close()
    return listo
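file_list (and the near-identical Example #8) is Python 2 code: the CSV is opened in binary mode and the text is round-tripped through decode/encode. Under Python 3 the same loop would look roughly like the sketch below; the twokenize normalization step is marked where it would slot in.

import csv
import datetime
import re

def file_list_py3(filepath, include_columns):
    # Python 3 variant: open in text mode with an explicit encoding
    hashwords = re.compile(r"#\S*", re.I)
    linkwords = re.compile(r"http\S*", re.I)
    reference = re.compile(r"@\S*", re.I)
    listo = []
    with open(filepath, newline='', encoding='latin-1') as ifile:
        reader = csv.reader(ifile)
        next(reader, None)                      # skip the header row
        for row in reader:
            listi = []
            for col in include_columns:
                if col == 1:                    # created_at column
                    listi.append(datetime.datetime.strptime(
                        row[col], "%a %b %d %H:%M:%S +0000 %Y"))
                elif col == 6:                  # tweet text column
                    query = row[col]
                    for pat in (linkwords, hashwords, reference):
                        query = pat.sub("", query)
                    # tk.squeezeWhitespace / tk.normalizeTextForTagger would go here
                    listi.append(query)
                else:
                    listi.append(row[col])
            listo.append(listi)
    return listo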
Example #5
def perprocessing(tdic):
    new_dic = {}
    for line in tdic:
        id = line
        gt = tdic[line][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[line][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        # print(text_tk)
        newtext = ' '.join(text_tk)
        newtext = textPreprocessor01.replaceall(newtext)
        new_dic[id] = gt, newtext
        # print(type(tdic[line][1]))
        # print(line)
        # print(type(line))
        # print(type(newtext))
        # print(newtext)
    return new_dic
Example #6
def normalize(text):
	# for easier comparison 
	text = text.lower()
	# to avoid IndexError: string index out of range (usually too much indent)
	text = squeezeWhitespace(text)
	# replace common abbreviations and HTML-escaped or otherwise unreadable characters
	text = text.replace("&gt;", ">")
	text = text.replace("&amp;", "&")
	text = text.replace("w/", "with")
	text = text.replace('\u2019', "'")
	text = text.replace('\u2026', "...")
	# remove URLs
	text = re.sub(r'http\S+', '', text)
	# remove the keyword 'springbreak' and its components
	text = text.replace('springbreak', '')
	text = text.replace('spring', '')
	text = text.replace('break', '')
	return normalizeTextForTagger(text)
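normalize assumes squeezeWhitespace and normalizeTextForTagger are already in scope (both come from the ARK twokenize module used throughout these examples). A quick illustration of its intended effect, with an assumed import path:

import re
from twokenize import squeezeWhitespace, normalizeTextForTagger  # assumed import path

print(normalize("SpringBreak   w/ friends &gt; work http://t.co/abc123"))
# roughly: " with friends > work " -- the URL and the spring/break keywords are stripped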
Example #7
def perprocessing(tdic):
    new_dic = {}
    for line in tdic:
        id = line
        gt = tdic[line][0]
        raw = ' '.join(twokenize.tokenizeRawTweetText(tdic[line][1]))
        text = twokenize.normalizeTextForTagger(raw)
        text_tk = twokenize.tokenize(text)
        telist = []
        ps = nltk.stem.PorterStemmer()  # build the stemmer once, outside the token loop
        for word in text_tk:
            word = ps.stem(word.lower())
            telist.append(word)
        newtext = ' '.join(telist)
        newtext = textPreprocessor02.replaceall(newtext)
        new_dic[id] = gt, newtext
    return new_dic
Example #8
def file_list(filepath, include_columns):
    ifile = open(filepath, "rb")
    reader = csv.reader(ifile)
    rownum = 0

    hashwords = re.compile(r"#\S*", re.I)
    linkwords = re.compile(r"http\S*", re.I)
    reference = re.compile(r"@\S*", re.I)

    listo = []
    for row in reader:
        if rownum == 0:
            header = row
        else:
            listi = []
            for col in include_columns:
                if col == 1:
                    time = datetime.datetime.strptime(
                        row[col], "%a %b %d %H:%M:%S +0000 %Y")
                    listi.append(time)
                elif col == 6:
                    query = str(row[col])
                    for res in re.finditer(linkwords, query):
                        query = query.replace(res.group(), "")
                    for res in re.finditer(hashwords, query):
                        query = query.replace(res.group(), "")
                    for res in re.finditer(reference, query):
                        query = query.replace(res.group(), "")
                    query = tk.squeezeWhitespace(query)
                    query = tk.normalizeTextForTagger(
                        query.decode('latin-1').encode("utf-8").decode('utf8'))
                    listi.append(query)
                else:
                    listi.append(row[col])
            listo.append(listi)
        rownum += 1
    ifile.close()
    return listo
Example #9
def tokenize(tweet):
    return twokenize.tokenize(twokenize.normalizeTextForTagger(tweet))
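A quick check of the Example #9 wrapper, assuming the ARK tokenizer is importable as twokenize:

import twokenize  # assumed: the ARK Twitter tokenizer module used above

tokens = tokenize("@user that's sooo cool!! http://t.co/xyz :-)")
print(tokens)
# the mention, words, repeated punctuation, URL and emoticon should come back as separate tokens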
Example #10
def get_tokens(text):
    return tokenize(clean(replace_sp_tokens(normalizeTextForTagger(text))))
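get_tokens (and parse_text in Example #2) chains two project helpers, replace_sp_tokens and clean, that are not included in these excerpts. A hypothetical sketch of what they might do, reusing the %TOKEN% placeholder style seen in parse_text; the real helpers may differ.

import re

def replace_sp_tokens(text):
    # hypothetical: map URLs and user mentions to placeholder tokens
    text = re.sub(r'https?://\S+', ' %URL% ', text)
    return re.sub(r'@\w+', ' %MENTION% ', text)

def clean(text):
    # hypothetical: drop control characters and collapse runs of whitespace
    text = re.sub(r'[\r\n\t]+', ' ', text)
    return re.sub(r'\s{2,}', ' ', text).strip()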
Example #11
def train_classifier(classpath):
    ifile = open(classpath, "rb")
    reader = csv.reader(ifile)
    rownum = 0
    tweetTime = []
    tweetDesc = []
    tweetR = []
    tweetToken = []
    tweet = ""
    ps = PorterStemmer()
    for row in reader:
        if rownum == 0:
            header = row
        else:
            colnum = 0
            if (row[0].find("+0000")!=-1):
                tweet = row[0].lower()
                tweetR.append(row[1])
                brk = tweet.index('+0000')+10
                tweetTime.append(datetime.datetime.strptime(tweet[:brk], "%a %b %d %H:%M:%S +0000 %Y"))
                x = tweet[brk:]
                squeeze = tk.squeezeWhitespace(x)
                normal = tk.normalizeTextForTagger(squeeze.decode('utf8'))
                tweetDesc.append(normal)
                punct_num = re.compile(r'[-.?!,":;()|0-9]')
                time_pat = re.compile(r"(\d{1,2}(.\d{1,2})|\d{1,2})(am|pm|AM|Am|PM|Pm)")
                date_pat = re.compile(r"\d{1,2}/\d{1,2}")
                week_pat = re.compile(r"Sun|Mon|Tue|Wed|Thurs|Fri|Sat|sunday|monday|tuesday|wednesday|thursday|friday|saturday", re.I)
    ##            print(rownum,normal)
                if(time_pat.search(normal)):
                    normal = normal + " timepresent"
                if(date_pat.search(normal)):
                    normal = normal + " datepresent"
                if(week_pat.search(normal)):
                    normal = normal + " weekpresent"
                normal = re.sub(time_pat, '', normal)
                normal = re.sub(date_pat, '', normal)
                normal = re.sub(week_pat, '', normal)
                normal = punct_num.sub("", normal)
                tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
                b = tokenizer.tokenize(normal)
                b = [i for i in b if (i not in stop)]
                token = [ps.stem(i) for i in b]
    ##            print(rownum)
                tweetToken.append(token)
                
        rownum += 1
    ifile.close()

    ## feature engineering

    documents = []
    all_words = []
    tweet_non = []
    tweet_rel = []
    for i in range(0,len(tweetR)):
        documents.append((tweetToken[i],tweetR[i]))
        all_words.extend(tweetToken[i])
        if (tweetR[i] == 'Non-Relevant'):
            tweet_non.extend(tweetToken[i])
        else:
            tweet_rel.extend(tweetToken[i])
            
    all_words_freq = nltk.FreqDist(all_words)
    rel_words_freq = nltk.FreqDist(tweet_rel)
    non_words_freq = nltk.FreqDist(tweet_non)

    ## rank words by their relevant/non-relevant count ratio, with add-1 smoothing

    init_features = list(all_words_freq.keys())
    score_words = []
    for i in init_features:
        score_words.append([float(rel_words_freq[i]+1)/float(non_words_freq[i]+1),i])
    score_words = sorted(score_words, reverse=True)
    scores = []
    scores = [i[0] for i in score_words]
    scores_mean = numpy.average(scores)

    # random sample generation: 2000 tweets for training, the rest for testing
    a = r.uniform(0, len(tweetToken), 2000)
    b = [int(i) for i in a]

    # a sweep over thresholds 0.5-1.9 was set up here, but the final run uses a fixed value
    threshold = 0.7
    # keep only words whose smoothed relevant/non-relevant ratio exceeds the threshold
    features_1 = []
    for i in range(0,len(score_words)):
        if score_words[i][0]>threshold:
            features_1.append(score_words[i][1])

    feature_score1=[]
    for i in range(0,len(tweetR)):
        feature_score1.append([find_features(tweetToken[i],features_1),tweetR[i]])

    trainingset = []
    for i in b:
        trainingset.append(feature_score1[i])

    testset = [x for x in feature_score1 if x not in trainingset]


    ## Naive Bayes

    naive = nltk.NaiveBayesClassifier.train(trainingset)
    accuracy = nltk.classify.accuracy(naive,testset)
    classifier = [naive,features_1,accuracy]

    return classifier
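train_classifier relies on a find_features helper that is not part of this excerpt. In the usual NLTK Naive Bayes setup it builds a boolean presence dictionary over a fixed feature vocabulary; the sketch below follows that assumption and shows how the returned [naive, features_1, accuracy] triple might be used on a new tweet.

def find_features(tokens, feature_words):
    # assumed helper: boolean bag-of-words features over the selected vocabulary
    token_set = set(tokens)
    return {word: (word in token_set) for word in feature_words}

# possible usage of the trained classifier
# naive, features_1, accuracy = train_classifier("classified.csv")
# label = naive.classify(find_features(["beach", "party", "tonight"], features_1))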
Example #12
            
            tweetR.append(row[1])

            # add 10 to the index of '+0000' because the tweet text starts 10 characters after it
            brk = tweet.index('+0000')+10

            
            tweetTime.append(datetime.datetime.strptime(tweet[:brk], "%a %b %d %H:%M:%S +0000 %Y"))
            TRACE(2,tweetTime)
            
            x = tweet[brk:]

            squeeze = tk.squeezeWhitespace(x)
            TRACE(3,squeeze)
            
            normal = tk.normalizeTextForTagger(squeeze.decode('utf8'))

            TRACE(4,normal)
            
            tweetDesc.append(normal)

            TRACE(5,tweetDesc)
            
            punct_num = re.compile(r'[-.?!,":;()|0-9]')
            time_pat = re.compile(r"(\d{1,2}(.\d{1,2})|\d{1,2})(am|pm|AM|Am|PM|Pm)")
            date_pat = re.compile(r"\d{1,2}/\d{1,2}")
            week_pat = re.compile(r"Sun|Mon|Tue|Wed|Thurs|Fri|Sat|sunday|monday|tuesday|wednesday|thursday|friday|saturday", re.I)

            #TRACE(6,find_events(normal))
            
            if(time_pat.search(normal)):