Example 1
def compute_tfidf(text, filename):
    numPara = len(text)
    print "there should be this many para in the text file ", numPara

    colList = []
    paragraphWords = []
    for i in range(numPara):
        paragraphWords = word_tokenize(text[i])
        colList.append(paragraphWords)
    collection = TextCollection(colList)

    for paraList in colList:
        dict = {}
        for term in paraList:
            print term, "has weight: ", collection.tf_idf(term, paraList)
            dict[term] = collection.tf_idf(term, paraList)
        '''
        print "BEFORE  <><><><><<><<>><><><><><><><>><><  ",type(dict)
        for key,value in dict.iteritems():
            print key," ",value
        '''
        d = sortDict(dict)
        print "AFTER SORTED  <><><><><<><<>><><><><><><><>><><  ", type(d)
        textFile = open(filename, "a")
        textFile.write("\n")
        for key, value in d:
            s = str(key) + "\t" + str(value) + "\n"
            textFile.write(s)
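This function (and its near-duplicate in Example 20) uses the Python 2 print statement, shadows the built-in name dict, and reopens the output file on every paragraph without ever closing it. A minimal Python 3 sketch of the same idea, under a hypothetical name, with sorted() in place of the sortDict helper and the file handled by a context manager:

from nltk import word_tokenize
from nltk.text import TextCollection

def compute_tfidf_py3(paragraphs, filename):
    # Tokenize each paragraph and build one collection over all of them.
    tokenized = [word_tokenize(p) for p in paragraphs]
    collection = TextCollection(tokenized)
    with open(filename, "a") as out:
        for tokens in tokenized:
            weights = {term: collection.tf_idf(term, tokens) for term in tokens}
            out.write("\n")
            # Highest-weighted terms first, one "term<TAB>weight" line per term.
            for term, weight in sorted(weights.items(), key=lambda kv: kv[1], reverse=True):
                out.write("{}\t{}\n".format(term, weight))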
Example 3
def compute_tf_idf(question, messages):
    import math

    texts = [question.keywords]
    total_length = 0
    for m in messages:
        total_length += len(m.keywords)
        text = Text(tokens=m.keywords)
        texts.append(text)
    text_collection = TextCollection(texts)
    question_tfidf_score = 0
    for k in question.keywords:
        tf_idf = text_collection.tf_idf(k, texts[0])
        question_tfidf_score += tf_idf

    if question_tfidf_score == 0:
        question_tfidf_score = 0.2
    if total_length == 0:
        total_length = 1
    length_factor = len(question.keywords) / total_length
    score = length_factor * math.log2(question_tfidf_score * 10)
    base_score = score
    if base_score == 0:
        base_score = 1

    print(question.content, question_tfidf_score, length_factor, score)
    print("^^^^^^^^^^^^^^^^^^^^^^^^^^")
    scores = []
    total_score = score
    print("Math", math)
    for i in range(0, len(messages)):
        tf_idf_i = 0
        for k in messages[i].keywords:
            tf_idf = text_collection.tf_idf(k, texts[i + 1])
            tf_idf_i += tf_idf
        if tf_idf_i == 0:
            continue
        length_factor = len(messages[i].keywords) / total_length
        score = length_factor * math.log2(tf_idf_i * 10)
        scores.append(score)
        total_score += score
        print(messages[i].content, tf_idf_i, length_factor, score)
        print("++++++++++++++++++++++++++++++++")
        # print(scores)
    averaged_scores = []
    last_message = question
    results = [last_message]
    for i in range(0, len(scores)):
        averaged_score = scores[i] / base_score
        averaged_scores.append(averaged_score)
        if averaged_score < 0.52:
            last_message.comments.append(messages[i])
        else:
            last_message = messages[i]
            results.append(last_message)
    print(averaged_scores)
    return results
Example 4
def nltk_tf_idf(corpus_one, file_name):
    print('-----starting nltk_tf_idf')
    corpus_one = [nltk.word_tokenize(doc) for doc in corpus_one]
    texts = TextCollection(corpus_one)

    for doc in corpus_one:
        yield {term: texts.tf_idf(term, doc) for term in doc}
Example 5
def vectorize_t(corpus):
    #corpus = [tokenize(doc) for doc in corpus]
    texts = TextCollection(corpus)
    return {
        term: texts.tf_idf(term, corpus)
        for term in corpus
    }
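As written, vectorize_t iterates over the documents themselves (the tokenize call is commented out), so each key passed to tf_idf is an entire document rather than a term. A hedged sketch of what was likely intended, weighting each distinct term against the whole collection (the pattern of Example 8), with NLTK's word_tokenize standing in for the project's own tokenize helper:

from nltk import word_tokenize
from nltk.text import TextCollection

def vectorize_t_fixed(corpus):
    # corpus: iterable of raw strings
    docs = [word_tokenize(doc) for doc in corpus]
    texts = TextCollection(docs)
    vocab = {term for doc in docs for term in doc}
    # One corpus-level tf-idf weight per distinct term.
    return {term: texts.tf_idf(term, texts) for term in vocab}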
Example 6
def sentenceAlignment(simpleParas, normalParas, pairedPara):
    for key,value in pairedPara.items(): # key is the simple paragraph index, value is the normal one
        print "**********************************"
        print "PARAGRAPH"
        print "##################################"
        
        SPara = simpleParas[key]
        NPara = normalParas[value]
        
        # given two paragraphs, returns a list of all sentences (each a list of words), plus the simple-sentence list and the normal-sentence list
        colList, sslist,nslist = formSentenceList(SPara,NPara)
        collection = TextCollection(colList)

        dict={}
        for sentence in colList:
            weight = 0
            
            for term in sentence:
                weight = collection.tf_idf(term,sentence)
                print "TERM -> ",term, "is",weight
                # what if the term is already in the dic, we need to add the weight
                if(term not in dict):
                    dict[term] = weight
                # dict[term] = weight
            
            #dict = sortDict(dict)
        print "================================================================"    

        '''
Example 7
    def __vectorize(self, corpus):
        corpus = [list(self.__tokenize(doc)) for doc in corpus]

        texts = TextCollection(corpus)

        for doc in corpus:
            yield {term: texts.tf_idf(term, doc) for term in doc}
Example 8
    def get_tf_idf_dict_nltk(
            self,
            column_type="review_body",
            save_path="tf_idf_value/hair_dryer_tf_idf_dict.csv"):
        '''
            ### nltk version
            it's super slow so don't use it
        '''
        reviews = self.raw_df[column_type].tolist()

        # get clean header
        reviews_list_cleaned = clean_tsv(reviews)

        # get all words
        words = set()
        for reviews in reviews_list_cleaned:
            for review in reviews:
                words.add(review)

        words = list(words)

        corpus = TextCollection(reviews_list_cleaned)

        tf_idf = []
        for word in words:
            tf_idf.append(corpus.tf_idf(word, corpus))

        df = pd.DataFrame({"word": words, "tf-idf": tf_idf})
        df.to_csv(save_path, encoding='utf-8')
Example 9
def ranking(reuters, corpus, docids, palavras):
    '''Ranks the retrieved texts, with the most relevant first.

    Args:
        reuters: corpus loaded from nltk
        corpus: dictionary mapping each index to its text
        docids: indices of the retrieved texts
        palavras: tokenized words from the query

    Returns:
        List of the indices, already ranked
    '''
    rank = {}
    tc = TextCollection(reuters)

    for e in docids:
        rank[e] = 0
        for i in palavras:
            rank[e] += tc.tf_idf(i, corpus[e])

    rank = {
        k: v
        for k, v in reversed(sorted(rank.items(), key=lambda item: item[1]))
    }
    return rank.keys()
Example 10
    def tf_idf(self):
        corpus = [
            list(self.cr.tokenize_strip_punct(desc))
            for desc in self.cr.texts()
        ]
        texts = TextCollection(corpus)
        for desc in corpus:
            yield {term: texts.tf_idf(term, desc) for term in desc}
Example 11
def nltk_tfidf_vectorize(corpus):
    from nltk.text import TextCollection

    corpus = [list(tokenize(doc)) for doc in corpus]
    texts = TextCollection(corpus)

    for doc in corpus:
        yield {term: texts.tf_idf(term, doc) for term in doc}
Example 12
def vectorize(corpus):
    corpus_tokenized = [list(tokenize(doc)) for doc in corpus]
    texts  = TextCollection(corpus_tokenized)
    
    for doc in corpus_tokenized:
        # yield one weight dict per document; a return here would stop after the first doc
        yield {
            term: texts.tf_idf(term, doc)
            for term in doc
        }
Example 13
def tf_idf_vectorize_nltk(corpus):
    print(corpus)
    #corpus = [tokenize(doc) for doc in corpus]
    texts  = TextCollection(corpus)
    print(texts)
    for doc in corpus:
        yield {
            term: texts.tf_idf(term, doc)
            for term in doc
        }
Example 14
def run_main():
    text1 = 'I like the movie so much '
    text2 = 'That is a good movie '
    text3 = 'This is a great one '
    text4 = 'That is a really bad movie '
    text5 = 'This is a terrible movie'

    tf_analy = TextCollection([text1, text2, text3, text4, text5])

    new_text = 'That one is a good movie. This is so good!'
    word = 'That'
    tf_idf_val = tf_analy.tf_idf(word, new_text)
    print(tf_idf_val)
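One caveat about this example (and the identical computation in Example 17): the texts are handed to TextCollection and tf_idf as raw strings, so NLTK counts characters and substrings rather than words. A tokenized sketch of the same lookup:

from nltk import word_tokenize
from nltk.text import TextCollection

docs = ['I like the movie so much', 'That is a good movie', 'This is a great one',
        'That is a really bad movie', 'This is a terrible movie']
tc = TextCollection([word_tokenize(d) for d in docs])
new_tokens = word_tokenize('That one is a good movie. This is so good!')
print(tc.tf_idf('That', new_tokens))  # word-level tf-idf of 'That' in the new sentence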
Example 15
def Generate_keyword(obj,length):
    orig_file = './Data/'+obj+'/'+obj+'.xlsx'
    data = xlrd.open_workbook(filename=orig_file)
    sheet = data.sheet_by_index(1)
    review_head = np.array(sheet.col_values(12))[1:]
    review_body = np.array(sheet.col_values(13))[1:]
    
    review_all=[]
    for i in range(length) :
        review = review_head[i] + " " +review_body[i]
        review_all.append(review)
    review_all = np.array(review_all)
    
    # make review tokens
    tokens=[]
    for i,review in enumerate(review_all):
        review = review.lower()
        replacer = RegexpReplacer()
        review = replacer.replace(review)
        remove = str.maketrans('','',string.punctuation) 
        review = review.translate(remove)
        token = nltk.word_tokenize(review)
        token = [w for w in token if w == 'not' or 
                 not w in stopwords.words('english')] 
        s = nltk.stem.SnowballStemmer('english')  
        token = [s.stem(ws) for ws in token]
        tokens.append(token)
    token_file = './Data/'+ obj +'/tokens.pkl'
    f=open(token_file,'wb')
    pickle.dump(tokens,f)
    f.close()
    
    corpus=TextCollection(tokens) 
    
    tf={}
    tf_idf={}
    for review in tokens:
        for word in review:
            if word not in tf :
                tf_=corpus.tf(word,corpus)
                tf[word]=tf_
            if word not in tf_idf :
                tf_idf_=corpus.tf_idf(word,corpus)
                tf_idf[word] = tf_idf_
                
    tf_sorted = sorted(tf.items(), key=lambda item:item[1], reverse=True)
    tf_idf_sorted = sorted(tf_idf.items(), key=lambda item:item[1],
                           reverse=True)
    
    pd.DataFrame(tf_sorted).to_csv('./Data/'+obj+'/tf_sorted.csv')
    pd.DataFrame(tf_idf_sorted).to_csv('./Data/'+obj+'/tf_idf_sorted.csv')
Example 16
def train_NB_tfidf_nltk(train_data,test_data,all_rev):   
    all_rev = [nltk.word_tokenize(rev) for rev in all_rev]
    corpus = TextCollection(all_rev)
    labels = train_data['label']
    train_rev = train_data['review']
    test_rev = test_data['review']  # needed below when building fs_test
    ID = test_data['ID']
    lab = get_lab(labels)
    fs_train = []
    print(train_rev[0])
    for i in range(0,len(train_rev)):
        cut_rev = nltk.word_tokenize(train_rev[i])    
        fs_dict = {}
        for j in range(0,len(cut_rev)):
            fs_dict[cut_rev[j]] = corpus.tf_idf(cut_rev[j],train_rev[i])
        fs_train.append((fs_dict,int(lab[i])))
    fs_test = []
    for i in range(0,len(test_rev)):
        cut_rev = nltk.word_tokenize(test_rev[i])    
        fs_dict = {}
        for j in range(0,len(cut_rev)):
            fs_dict[cut_rev[j]] = corpus.tf_idf(cut_rev[j],test_rev[i])
        fs_test.append(fs_dict)
    
    classifier=nltk.NaiveBayesClassifier.train(fs_train)
    label = 1
    train_score = []
    test_score = []
    for i in range(0,len(fs_train)):
        dist = classifier.prob_classify(fs_train[i][0])
        train_score.append(dist.prob(label))
    train_score = np.array(train_score,dtype="float32")
    for i in range(0,len(fs_test)):
        dist = classifier.prob_classify(fs_test[i])
        test_score.append(dist.prob(label))
    test_score = np.array(test_score,dtype="float32")
    print("AUC: ",cal_auc(train_score,lab))
    result = pd.DataFrame({'ID':ID.T,'Pred':test_score.T})
    result.to_csv("./result.csv",index = None)
Example 17
def text_classification():
    """
    文本分类
    :return:
    """
    text1 = 'I like the movie so much '
    text2 = 'That is a good movie '
    text3 = 'This is a great one '
    text4 = 'That is a really bad movie '
    text5 = 'This is a terrible movie'

    # Build the TextCollection object
    tc = TextCollection([text1, text2, text3, text4, text5])
    new_text = 'That one is a good movie. This is so good!'
    word = 'That'
    tf_idf_val = tc.tf_idf(word, new_text)
    print('TF-IDF value of {}: {}'.format(word, tf_idf_val))
Example 18
def retrieve_results(n_percentile):
    search_queries = parse_trec('documents/irg_queries.trec')
    search_collections = parse_trec('documents/irg_collection_clean.trec')
    # search_collections = parse_trec('documents/irg_collection_short.trec')
    # search_collections = eliminate_stopwords(search_collections)
    # write_collection_doc(search_collections, 'documents/irg_collection_clean.trec')

    print('======= Statistics =======')
    print(f'Queries: {len(search_queries)}')
    print(f'Collections: {len(search_collections)}')
    print(f'Removal of {int((1-n_percentile)*100)}%-ile')
    print('==========================')

    # TF-IDF
    document_results = []
    for search_query_id, search_query_text in search_queries.items():
        print(
            f'Current query id: {search_query_id}, text: "{search_query_text}"'
        )
        terms = search_query_text.split(' ')
        documents = keep_n_percentile_most_relevant_words(search_collections,
                                                          search_query_text,
                                                          n=n_percentile)
        document_scores = {}
        search_texts_collection = TextCollection(documents.values())
        for document_id, document_text in documents.items():
            for term in terms:
                current_score = document_scores.get(document_id, 0.0)
                document_scores[
                    document_id] = current_score + search_texts_collection.tf_idf(
                        term, document_text)

        rank = 1
        for document_id, document_score in sorted(document_scores.items(),
                                                  key=lambda kv: kv[1],
                                                  reverse=True):
            if rank <= 1000:
                document_results.append(
                    Result(search_query_id, document_id, rank,
                           document_score))
                rank += 1

    result_writer(document_results,
                  f'IE_result_keep_{int(n_percentile*100)}_percentile.trec')
    print('Done')
Example 19
def compute_tf_idf_similarity(query: str, content: str, type: str) -> float:
    """
    Compute the mean tf-idf or tf
     similarity for one sentence with multi query words.
    :param query: a string contain all key word split by one space
    :param content: string list with every content relevent to this query.
    :return: average tf-idf or tf similarity.
    """
    sents = [word_tokenize(content),
             word_tokenize("")]  # add one empty file to smooth.
    corpus = TextCollection(sents)  # build the corpus

    result_list = []
    for key_word in query.strip(" ").split(" "):
        if type == "tf_idf":
            result_list.append(corpus.tf_idf(key_word, corpus))
        elif type == "tf":
            result_list.append(corpus.tf(key_word, corpus))
        else:
            raise KeyError

    return sum(result_list) / len(result_list)
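A small usage sketch (assuming word_tokenize and TextCollection are imported, as this snippet already requires); because the corpus holds only the content plus one empty smoothing document, the idf of any keyword found in the content is log(2/1):

content = "That is a good movie and a really good story"
print(compute_tf_idf_similarity("good movie", content, "tf_idf"))
print(compute_tf_idf_similarity("good movie", content, "tf"))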
Example 20
def compute_tfidf(text,filename):
    numPara = len(text)
    print "there should be this many para in the text file ", numPara
    
    colList = []
    paragraphWords = []
    for i in range(numPara):
        paragraphWords = word_tokenize(text[i])
        colList.append(paragraphWords)
    collection = TextCollection(colList)
     
    for paraList in colList:
        dict={}
        for term in paraList:
            dict[term]= collection.tf_idf(term,paraList)
        d=sortDict(dict)
        textFile=open(filename,"a")
        textFile.write("\n")

        for key,value in d:
            s = str(key) + "\t" + str(value)+"\n"
            textFile.write(s)
Example 21

standard_position_dict = position_lookup(standard_freq_vector)
# print(standard_position_dict)

sentence = 'this is cool'
freq_vector = [0] * size
tokens = nltk.word_tokenize(sentence)
for word in tokens:
    try:
        freq_vector[standard_position_dict[word]] += 1
    except KeyError:
        continue

# print(freq_vector)

corpus = TextCollection(['this is sentence one',
                         'this is sentence two',
                         'this is sentence three'])

standard_vocab = []
for i in standard_freq_vector:
    standard_vocab.append(i[0])

# print(corpus.tf('is', 'this is sentence four'))

new_sentence = 'this is sentence five'
for word in standard_vocab:
    print(corpus.tf_idf(word, new_sentence))

# 3. Text classification and TF-IDF

# 3.1 TF-IDF in NLTK
from nltk.text import TextCollection

text1 = 'I like the movie so much '
text2 = 'That is a good movie '
text3 = 'This is a great one '
text4 = 'That is a really bad movie '
text5 = 'This is a terrible movie'

# Create the TextCollection object
tc = TextCollection([text1, text2, text3, text4, text5])
new_text = 'That one is a good movie. This is so good!'
word = 'That'
tf_idf_val = tc.tf_idf(word, new_text)
print('TF-IDF value of {}: {}'.format(word, tf_idf_val))

# 3.2 TF-IDF in sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
feat = vectorizer.fit_transform([text1, text2, text3, text4, text5])
print(vectorizer.get_feature_names())
feat_arrary = feat.toarray()
print(feat_arrary.shape)
print(feat_arrary[0, :])
print(vectorizer.transform([new_text]).toarray())

# 3.3 TF-IDF for Chinese text
ch_text1 = ' 非常失望,剧本完全敷衍了事,主线剧情没突破大家可以理解,可所有的人物都缺乏动机,' \
           '正邪之间、妇联内部都没什么火花。团结-分裂-团结的三段式虽然老套但其实也可以利用积' \
Example 23
def nltk_tfidf_vectorize(lists_of_tokens):
    texts = TextCollection(lists_of_tokens)
    for article in lists_of_tokens:
        yield {term: texts.tf_idf(term, article) for term in article}
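A usage sketch with a small hypothetical input (TextCollection is assumed to be imported from nltk.text, as in the surrounding examples):

articles = [['red', 'apple', 'pie'], ['green', 'apple'], ['red', 'car', 'red', 'wheel']]
for weights in nltk_tfidf_vectorize(articles):
    best = max(weights, key=weights.get)  # highest-weighted term in this article
    print(best, round(weights[best], 3))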
Example 24
#!/usr/bin/env python
# _*_ coding:utf-8 _*_

from nltk.text import TextCollection

# First, put all of the documents into the TextCollection class.
# It tokenizes them and does the counting and calculations for you.
corpus = TextCollection(
    ['this is sentence one', 'this is sentence two', 'this is sentence three'])

# tf-idf can then be computed directly
# (term: a term from a sentence, text: that sentence)
print(corpus.tf_idf('this', 'this is sentence four'))
# 0.444342

# Likewise, how do we get a fixed-size vector to represent every sentence?

# For each new sentence
new_sentence = 'this is sentence five'
# iterate over every word in the vocabulary:
for word in standard_vocab:
    print(corpus.tf_idf(word, new_sentence))
    # this yields a very long vector (length = size of the vocabulary)
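standard_vocab is not defined in this snippet; a self-contained sketch of the idea described in the comments, building a fixed vocabulary and then a fixed-length tf-idf vector for a new sentence (tokenizing with split so the counts are word-based):

from nltk.text import TextCollection

sents = ['this is sentence one', 'this is sentence two', 'this is sentence three']
corpus = TextCollection([s.split() for s in sents])
standard_vocab = sorted({w for s in sents for w in s.split()})

new_sentence = 'this is sentence five'.split()
vector = [corpus.tf_idf(w, new_sentence) for w in standard_vocab]
print(len(vector), vector)  # one tf-idf value per vocabulary word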
Example 25
xmlcollection.get_words_by_editdistance(editdistance=editdistance,
                                        no_of_most_freq=no_of_topwords)

# Write the found sets to disk; also write most frequent words to disk.
xmlcollection.write_words_by_editdistance(editdistance=editdistance)
xmlcollection.write_topwords(no_of_words=no_of_topwords)
print "Top words written to disk."

# FIXME

# Print idf, tf and tf-idf values for the term "CCC", in document
# no. 42 - for testing.
nltk_textcollection = TextCollection(xmlcollection.get_words())
print "idf: " + str(nltk_textcollection.idf("CCC"))
print "tf: " + str(nltk_textcollection.tf("CCC", 
    TextCollection(xmlcollection.get_doc(42).get_tokens())))
print "tf_idf: " + str(nltk_textcollection.tf_idf("CCC", 
    TextCollection(xmlcollection.get_doc(42).get_tokens())))

# Do that now systematically for all documents
print "Document where tf is bigger 0:"
cnt = 0
for doc in xmlcollection.get_docs():
    tf = nltk_textcollection.tf("CCC", TextCollection(doc.get_tokens()))
    stdout.write(str(tf) + ", ")
    cnt += 1
    if cnt == 10: 
        print
    cnt = 0
    if tf > 0.0: 
        print "\n" + doc.get_xml_filename()
Example 26
# -*- coding: utf-8 -*-
from nltk.text import TextCollection
__author__ = 'Alan Hou'

corpus = TextCollection(['this is sentence one', 'this is sentence two', 'this is sentence three'])
# compute tf-idf directly
print(corpus.tf_idf('this', 'this is sentence four'))
Example 27
def prepare_Custom(data, common, count, training=True):
    X = []
    Y = []
    wordsid = ["" for x in range(count)]
    common = {k: common[k] for k in list(common)[:count]}
    default = np.zeros(count)
    i = 0
    for word in common:
        default[i] = 0
        wordsid[i] = word
        i = i + 1

    wordWeights = []
    for i in range(count):
        wordWeights.append(1 / (i + 1))

    wordWeights = np.array(wordWeights)

    if (training):
        global Text
        Text = []
        for data_point in data:
            Text.append(data_point['original_text'].lower())

    corpus = TextCollection(Text)

    for data_point in data:
        occur = default.copy()  # copy so counts do not accumulate across data points
        x = []
        Tf_ide = []
        data_point["num_words"] = len(data_point['text'])
        data_point['sentiment'] = np.abs(
            nltk_sentiment(data_point['original_text']))
        data_point['exclam'] = data_point['original_text'].count('!')
        data_point['hash'] = data_point['original_text'].count('#')
        popularScore = 0
        for word in data_point['text']:
            if word in wordsid:
                occur[wordsid.index(word)] += 1
                popularScore += 1
        for word in common:
            # tf-idf features
            x.append(corpus.tf_idf(word, data_point['original_text']))

        # popularity-frequency (normalized)
        x.append(popularScore / data_point["num_words"])
        # transform(children) feature
        x.append(np.log(data_point['children']**2 + 1))
        # not new
        x.append(data_point['controversiality'])
        # not new
        x.append(int(data_point['is_root']))
        # compute an index of common words
        x.append(occur.dot(wordWeights.T))
        # sentiment analysis transofmed
        x.append(np.log(np.abs(data_point['sentiment'])**2 + 1))
        # count of exclamation points
        x.append(data_point['original_text'].count('!'))
        # count of hashtags
        x.append(data_point['original_text'].count('#'))
        x.append(1)
        X.append(x)
        Y.append(data_point['popularity_score'])

    return np.array(X), np.array(Y)
Example 28
]
sents = [word_tokenize(sent) for sent in sents]  # tokenize each sentence
print(sents)  # print the tokenized result
corpus = TextCollection(sents)  # build the corpus
print(corpus)  # print the corpus

# compute the tf of "one" in the corpus
tf = corpus.tf('one', corpus)  # 1/12
print(tf)

# compute the idf of "one" in the corpus
idf = corpus.idf('one')  # log(3/1)
print(idf)

# compute the tf-idf of "one" in the corpus
tf_idf = corpus.tf_idf('one', corpus)
print(tf_idf)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

x_train = [
    'TF-IDF 主要 思想 是', '算法 一个 重要 特点 可以 脱离 语料库 背景',
    '如果 一个 网页 被 很多 其他 网页 链接 说明 网页 重要'
]
x_test = ['原始 文本 进行 标记', '主要 思想']

# This class converts the words in the texts into a term-frequency matrix,
# where element a[i][j] is the frequency of word j in document i
vectorizer = CountVectorizer(max_features=10)
# This class computes the tf-idf weight of each word
tf_idf_transformer = TfidfTransformer()
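The snippet ends before the two sklearn objects are applied; a sketch of a typical continuation (not part of the original code):

x_train_counts = vectorizer.fit_transform(x_train)                # term-frequency matrix
x_train_tfidf = tf_idf_transformer.fit_transform(x_train_counts)  # tf-idf weights
x_test_tfidf = tf_idf_transformer.transform(vectorizer.transform(x_test))
print(x_train_tfidf.toarray().shape, x_test_tfidf.toarray().shape)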
Example 29
import pymongo
from pymongo import Connection
MONGODB_PORT = 27017
import nltk
from nltk.corpus import brown
from nltk.text import TextCollection
mongodb=Connection("localhost", MONGODB_PORT)['cablegate']
browntext = TextCollection(brown.words(categories=['news','government']))
count=0
for ng in mongodb.ngrams.find(timeout=False):
	mongodb.ngrams.update({"_id":ng["_id"]},{"$set":{"tfidf": browntext.tf_idf(ng['label'],brown.words(categories=['news','government'])) }})
	count+=1
	print "updated tfidf for %d topics"%count
Example 30
        if i in string.punctuation:  # if the character is punctuation, replace it with a space
            s['text'] = s['text'].replace(i, " ")
    sentence.append(s['text'])

sents = [word_tokenize(sent) for sent in sentence]

corpus = TextCollection(sents)

tf_idf = []
for sen in sents:
    td = []
    for data in sen:
        elem = []
        data = data.lower()
        if data not in stop_words:
            td.append(corpus.tf_idf(data, corpus))
    tf_idf.append(td)

# cosine = []
count_s = []
aspect_cosine = []
for i in range(len(sents)):
    sentences_vector = []
    for w in sents[i]:
        w = w.lower()
        if w not in stop_words:
            try:
                word_index = list(words_index.keys())[list(
                    words_index.values()).index(w)]
                sentences_vector.append(wordVectors[word_index])
            except ValueError:
Example 31
topN = []
for i in eliminateBiggerThanOne[:2000]:
    topN.append(i[0])
print('topN' , topN)
########################################################################################################################



## Getting TF-IDF of the TopN words to get max and min##
s = [d for (d,c) in sentences]
tfList = []
for sen in s:
    for word in sen:
        if word in topN:
            tfList.append(x.tf_idf(word,sen))

print(len(tfList))
tf_max = max(tfList)  # avoid shadowing the built-in max
print('max', tf_max)
tf_min = min(tfList)  # avoid shadowing the built-in min
print('min', tf_min)
res = tf_max - tf_min
print('res', res)
half = res/2
print('half', half)
twoThird = (tf_max + half)/2
print('twoThird', twoThird)
quarter = half/2
print('quarter', quarter)
Example 32
def alignText(simpleParas, normalParas, pairedPara): 
    #print simpleParas, len(simpleParas)
    #print normalParas, len(normalParas)
    for key,value in pairedPara.items(): # key is the simple paragraph index, value is the normal one
        SPara = simpleParas[key]
        NPara = normalParas[value]
        print "=================Paragraphs were above======================================"
        # given two paragraphs, returns a list of all sentences (each a list of words), plus the simple-sentence list and the normal-sentence list
        colList, sslist,nslist = formSentenceList(SPara,NPara)
        collection = TextCollection(colList)

        # this is a list of Word object
        wordsWithWeight = []

        dict={}
        for sentence in colList:
            weight = 0
            for term in sentence:
                if term not in PUNCTLIST and term not in STOPWORDS and term not in commonAuxilaryVerbs:
                    weight = collection.tf_idf(term,sentence)
                    # what if the term is already in the dic, we need to add the weight
                    if(term not in dict):
                        w = Word(term,"","")
                        w.setWeight(weight)
                        wordsWithWeight.append(w)
                        #dict[term] = weight
                    # dict[term] = weight
            
            #dict = sortDict(dict)
        temp=[]
        for sentence in sslist:
            tokSen = word_tokenize(sentence)
            temp.append(tokSen)
        sslist = temp
        temp=[]
        for sentence in nslist:
            tokSen = word_tokenize(sentence)
            temp.append(tokSen)
        nslist = temp

        
        for simpleLine in sslist:
            stringSimpleLine = listToString(simpleLine)
            # semantic part
            simplefilename = "sentence1.txt"
            SFile=open(simplefilename,"w+")
            SFile.write(stringSimpleLine)
            SFile.close()
            parseFile("sentence1.txt")
            # if failed to parse, skip this sentence and continue
            if verifyParsedFile("parsedsentence1.txt")  == False:
                continue

            buildClause("parsedsentence1.txt", "one")
            # end semantic part
            maxSimilarity = 0
            for normalLine in nslist:
                stringNormalLine = listToString(normalLine)
                # semantic part
                normalfilename = "sentence2.txt"
                NFile=open(normalfilename,"w+")
                NFile.write(stringNormalLine)
                NFile.close()
                parseFile("sentence2.txt")
                #check whether parsing was done properly
                # if failed to parse, skip this sentence and continue
                if verifyParsedFile("parsedsentence2.txt")  == False:
                    continue

                # end semantic part

                #buildClause("parsedsentence1.txt", "one")
                buildClause("parsedsentence2.txt","two")
                sentence1Words = []
                sentence2Words = []
                #makeContextFile(n1,v1,n2,v2)
                
                sentence1Words, sentence2Words = makeContextFile(n1,v1,n2,v2)
                # all words is a dictionary of words:tfidf. I converted this to a dictionary from a list of wordsWithWeight for convenience 
                allWords = {}
                for w in wordsWithWeight:
                    allWords[w.getValue()]=w.getWeight()
                numerator1 = 0
                denominator1 = 0
                for word in sentence1Words:
                    if(word.getValue() in allWords):
                        tfidf = allWords[word.getValue()]
                        semanticWeight = word.getWeight()
                        numerator1 = numerator1+ (semanticWeight*tfidf)
                        denominator1 = denominator1 + allWords[word.getValue()]
                if(denominator1==0):
                    denominator1 = 1
                partA = numerator1/denominator1
                numerator2 = 0
                denominator2 = 0
                for word in sentence2Words:
                    #print "dic index:->", word.getValue(),"value: ",allWords[word.getValue()]
                    if(word.getValue() in allWords): 
                        tfidf = allWords[word.getValue()]
                        semanticWeight = word.getWeight()
                        numerator2 = numerator2+ (semanticWeight*tfidf)
                        denominator2 = denominator2 + allWords[word.getValue()]
                if(denominator2==0):
                    denominator2 = 1
                partB = numerator2/denominator2
                

                SIMILARITY = (partA + partB)/2
                print "><><><><><><><><><><><><><><><><><><><><><><"
                print stringSimpleLine
                print "--------------------------------------------"
                print stringNormalLine
                print "Similarity Score -----> ", SIMILARITY
                print "><><><><><><><><><><><><><><><><><><><><><><"
Example 33
from __future__ import print_function
from nltk.corpus import PlaintextCorpusReader
from nltk.text import TextCollection

#load all the files in the corpus root,
#and calculate tf, idf, and tf_idf on them, and on a specific term

if __name__ == "__main__":
    corpus_root = '../data/source_data'
    corpus = PlaintextCorpusReader(corpus_root,'[a-zA-Z \-]*\.txt')

    ids = corpus.fileids()

    collection = TextCollection(corpus)

    #for x,word in enumerate(corpus.words(ids[0])[:200]):
    #    print(x,word)

    source = ids[0]
    term = corpus.words(source)[107]
    doc = corpus.words(ids[2])



    print("Source: ",source)
    print("TF of: ",term,": ",collection.tf(term,doc))
    print("IDF of: ",term,": ",collection.idf(term))
    print("tf_Idf of:",term,": ",collection.tf_idf(term,doc))

Example 34
#coding:utf-8

import nltk
from nltk.text import TextCollection

data = "Hello world!"

tokens = nltk.word_tokenize(data)
print(tokens)
print("---------------------------------------")
corpus = TextCollection(
    ['this is sentence one', 'this is sentence two', 'this is sentence three'])
print(corpus.tf_idf("this", "this is sentence four"))

import numpy as np
from numpy import dot

a = np.array([1, 0])
p = np.array([[.9, .1], [.5, .5]])
n = dot(a, p)
for i in range(1000):
    # n = dot(a,p)
    n = dot(n, p)

print("res::", n)
Example 35
class WeightedTweetClassifier(TweetClassifier):
    """
    Basic idea:
    train TF-IDF model on training data
    filter out all words that we do not have clues for
    multiply all remaining term weights with the corresponding clues (+1, -1, 0), and sum the results
    """
    def __init__(self, dictfile=None, trainfile=None, datafile=None, outfile=None):
        # Call the superclass constructor
        super(WeightedTweetClassifier, self).__init__(trainfile, datafile, outfile)
        self.stemmer = PorterStemmer()

        self.trainfile = trainfile
        self.datafile = datafile
        self.outfile = outfile

        #this contains the clues we were given: {"clue":1.0, "clue2":-1.0 ... }
        self.clueValues = {}

        #the NLTK TextCollection class is used because it provides TF-IDF functionality.
        self.textCollection = None

        # read the clues
        self.readDictionary(dictfile)

        # for saving sentiment scores, so they can be meaningfully used later on by e.g. the Joint Classifier
        self.scores = {}

    def readDictionary(self, dictfile=None):
        """
        read the dictionary file. +1, -1 or 0 is saved as a sentiment for each (stemmed) term in self.clueValues

        TODO: maybe we don't want to stem, but instead use the provided POS tags? could be a separate classifier though
        """
        with open(dictfile, "r") as dictdata:
            for line in dictdata.readlines():
                fields = line.split(" ")
                token = self.stemmer.stem(fields[2].split("=")[1].strip())
                polarity = fields[5].split("=")[1].strip()
                self.clueValues[token] = (1.0 if polarity == "positive" else (-1.0 if polarity == "negative" else 0.0))

    def train(self, trainfile=None):
        print "training WeightedTweetClassifier"
        self.readTrainingData((trainfile or self.trainfile))
        for tweet in self.trainingTweets:
            # lowercase, remove punctuation
            nopunct = string.lower(tweet.tweet.translate(string.maketrans("",""), string.punctuation))
            tweet.tweet = nopunct
        # add all Tweets to our TextCollection. This automatically creates a TF-IDF model
        self.textCollection = TextCollection([tweet.tweet for tweet in self.trainingTweets])

    def classifyTweets(self, datafile=None, outfile=None):
        print "reading dataset"
        self.readDataset(datafile)

        print "classifying Tweets with weighted classifier"
        for tweet in self.evalTweets:
            # score = sum of TF-IDF weighted terms which carry sentiment
            tokens = string.lower(tweet.tweet.translate(string.maketrans("",""), string.punctuation)).split(" ")
            score = sum([self.textCollection.tf_idf(token, tweet.tweet) * self.clueValues.get(self.stemmer.stem(token), 0)
                         for token in tokens])
            self.scores[(tweet.id1, tweet.id2)] = score

            # Any score very close or equal to 0 is judged to be neutral.
            tweet.sentiment = ("neutral" if abs(score) < 0.01 else ( "negative" if score < 0 else "positive"))
Example 36
        aspect_keywords.append(keywords_vector)

sentence = []
for s in sentences:
    for i in s['text']:
        if i in string.punctuation:  # if the character is punctuation, replace it with a space
            s['text'] = s['text'].replace(i, " ")
    sentence.append(s['text'])

sents = [word_tokenize(sent) for sent in sentence]

corpus = TextCollection(sents)

tf_idf = []
for sen in sents:
    td = []
    for data in sen:
        elem = []
        data = data.lower()
        if data not in stop_words:
            # print(data)
            td.append(corpus.tf_idf(data, corpus))
    tf_idf.append(td)

for aspect in aspect_keywords:
    for vector in aspect[1:]:
        print(
            deal_data.cosine(aspect[0], vector) *
            corpus.tf_idf('food', corpus))
    print('\n')
Example 37
arff.write("@relation sentiment_analysis\n\n")
arff.write("@attribute numPosEmots numeric\n")
arff.write("@attribute numNegEmots numeric\n")
arff.write("@attribute numQuest numeric\n")
arff.write("@attribute numExclam numeric\n")
arff.write("@attribute numPosGaz numeric\n")
arff.write("@attribute numNegGaz numeric\n")
for word in words:
	arff.write("@attribute word_")
	sub_w = re.subn('[^a-zA-Z]', 'X', word)
	arff.write(sub_w[0])
	if sub_w[1] > 0:
		arff.write('_' + str(wc))
		wc += 1
	arff.write(" numeric\n")
arff.write("@attribute class {POS, NEG, OTHER}\n\n")
arff.write("@data\n")

# data
for i in xrange(len(tweets)):
	arff.write(str(emots_count[i][0]) + ',' + str(emots_count[i][1]) + ',')
	arff.write(str(punct_count[i][0]) + ',' + str(punct_count[i][1]) + ',')
	arff.write(str(gaz_count[i][0]) + ',' + str(gaz_count[i][1]) + ',')
	
	for j in xrange(len(words)):   #loop through unigrams
		arff.write(str(texts.tf_idf(words[j], tweets[i])) + ',')
	
	arff.write(sentiments[i] + '\n')

arff.close()
print '\nFinished pre-processing! The ARFF file for Weka has been created.'