Example #1
def compute_tfidf(text,filename):
    numPara = len(text)
    print "there should be this many para in the text file ", numPara
    
    colList = []
    paragraphWords = []
    for i in range(numPara):
        paragraphWords = word_tokenize(text[i])
        colList.append(paragraphWords)
    collection = TextCollection(colList)
     
    for paraList in colList:
        dict={}
        for term in paraList:
            print term, "has weight: ", collection.tf_idf(term,paraList)
            dict[term]= collection.tf_idf(term,paraList)
        '''
        print "BEFORE  <><><><><<><<>><><><><><><><>><><  ",type(dict)
        for key,value in dict.iteritems():
            print key," ",value
        '''
        d=sortDict(dict)
        print "AFTER SORTED  <><><><><<><<>><><><><><><><>><><  ",type(d)
        textFile=open(filename,"a")
        textFile.write("\n")
        for key,value in d:
            s = str(key) + "\t" + str(value)+"\n"
            textFile.write(s)
Example #2
    def __vectorize(self, corpus):
        corpus = [list(self.__tokenize(doc)) for doc in corpus]

        texts = TextCollection(corpus)

        for doc in corpus:
            yield {term: texts.tf_idf(term, doc) for term in doc}
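For orientation, here is a minimal self-contained sketch of the three TextCollection calls that the examples in this collection build on. The toy corpus and whitespace tokenization are illustrative only; most examples below tokenize with nltk instead.

from nltk.text import TextCollection

# Toy corpus: each document is a list of tokens (made-up data).
docs = [
    "the cat sat on the mat".split(),
    "the dog chased the cat".split(),
    "dogs and cats are pets".split(),
]
collection = TextCollection(docs)

term, doc = "cat", docs[0]
print(collection.tf(term, doc))      # occurrences of term in doc divided by doc length
print(collection.idf(term))          # log(number of docs / number of docs containing term)
print(collection.tf_idf(term, doc))  # product of the two values above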
Example #4
def sentenceAlignment(simpleParas, normalParas, pairedPara):
    for key,value in pairedPara.items(): # key is the simple-paragraph index, value is the normal-paragraph index
        print "**********************************"
        print "PARAGRAPH"
        print "##################################"
        
        SPara = simpleParas[key]
        NPara = normalParas[value]
        
        # given two paragraphs, formSentenceList returns a list of all sentences (each sentence is a list of words), plus the simple-sentence list and the normal-sentence list
        colList, sslist,nslist = formSentenceList(SPara,NPara)
        collection = TextCollection(colList)

        dict={}
        for sentence in colList:
            weight = 0
            
            for term in sentence:
                weight = collection.tf_idf(term,sentence)
                print "TERM -> ",term, "is",weight
                # what if the term is already in the dic, we need to add the weight
                if(term not in dict):
                    dict[term] = weight
                # dict[term] = weight
            
            #dict = sortDict(dict)
        print "================================================================"    

Example #5
    def get_tf_idf_dict_nltk(
            self,
            column_type="review_body",
            save_path="tf_idf_value/hair_dryer_tf_idf_dict.csv"):
        '''
            ### nltk version
            it's super slow so don't use it
        '''
        reviews = self.raw_df[column_type].tolist()

        # get clean header
        reviews_list_cleaned = clean_tsv(reviews)

        # get all words
        words = set()
        for reviews in reviews_list_cleaned:
            for review in reviews:
                words.add(review)

        words = list(words)

        corpus = TextCollection(reviews_list_cleaned)

        tf_idf = []
        for word in words:
            tf_idf.append(corpus.tf_idf(word, corpus))

        df = pd.DataFrame({"word": words, "tf-idf": tf_idf})
        df.to_csv(save_path, encoding='utf-8')
Example #6
def vectorize_t(corpus):
    #corpus = [tokenize(doc) for doc in corpus]
    texts = TextCollection(corpus)
    return {
        term: texts.tf_idf(term, corpus)
        for term in corpus
    }
Example #7
def ranking(reuters, corpus, docids, palavras):
    '''Ranks the retrieved texts, placing the most relevant first

    Args:
        reuters: corpus from nltk
        corpus: dictionary mapping each index to its text
        docids: indices of the retrieved texts
        palavras: tokenized words of the query

    Returns:
        List with all the indices, already ranked
    '''
    rank = {}
    tc = TextCollection(reuters)

    for e in docids:
        rank[e] = 0
        for i in palavras:
            rank[e] += tc.tf_idf(i, corpus[e])

    rank = {
        k: v
        for k, v in reversed(sorted(rank.items(), key=lambda item: item[1]))
    }
    return rank.keys()
Example #8
def nltk_tf_idf(corpus_one, file_name):
    print('-----starting nltk_tf_idf')
    corpus_one = [nltk.word_tokenize(doc) for doc in corpus_one]
    texts = TextCollection(corpus_one)

    for doc in corpus_one:
        yield {term: texts.tf_idf(term, doc) for term in doc}
Example #9
	def getDomainUnigram(self, directory = None):		
		collocations = set()  #collocation items
		ewordlists = list() #list of lists of words
		
		#extract words from essays
		if directory is not None:
			doclist = os.listdir(directory)
			for essay in doclist:
				dir_essay  = directory+'/'+essay
				etext = open(dir_essay,'r').read()
				tokens = nltk.wordpunct_tokenize(etext)
				tokens = [word.lower() for word in tokens]
				#stemming
				if self._stemoption ==True:
					st = PorterStemmer()
					tokens = [st.stem(t) for t in tokens]
				
				#extract the collocation for the given essay
				e_bigram = set(Mytext(tokens).collocations())
				collocations = collocations | e_bigram
				ewordlists.append(tokens)
				
		else: # using the mapped essay to calculate the candidate bigrams
			#need to call mapessay function first
			for ins in self._data:
				if ins['essay'] is not None:
					etext = open(ins['essay'],'r').read()
					tokens = nltk.wordpunct_tokenize(etext)
					tokens = [word.lower() for word in tokens]
					#stemming
					if self._stemoption ==True:
						st = PorterStemmer()
						tokens = [st.stem(t) for t in tokens]
				
					#extract the collocation for the given essay
					e_bigram = set(Mytext(tokens).collocations())
					collocations = collocations | e_bigram
					ewordlists.append(tokens)
		
		#get collection of all essays under the specified directory / associated essays
		collection_text = TextCollection(ewordlists)
		
		itemlist = list()
		for (a, b) in collocations:
			itemlist.append(a)
			itemlist.append(b)
			
		itemlist = list(set(itemlist))	
		
		word_idf = []
		for i in range(len(itemlist)):
			word_idf.append((collection_text.idf(itemlist[i]), itemlist[i]))	
		
		word_idf = sorted(word_idf, key = operator.itemgetter(0))
		ave = 0
		if len(word_idf)!=0:
			ave = sum(map(operator.itemgetter(0), word_idf)) / len(word_idf)
			
		wlist =  [j for (i, j) in word_idf if i<ave]				
		return wlist
Example #10
 def tf_idf(self):
     corpus = [
         list(self.cr.tokenize_strip_punct(desc))
         for desc in self.cr.texts()
     ]
     texts = TextCollection(corpus)
     for desc in corpus:
         yield {term: texts.tf_idf(term, desc) for term in desc}
Example #11
def nltk_tfidf_vectorize(corpus):
    from nltk.text import TextCollection

    corpus = [list(tokenize(doc)) for doc in corpus]
    texts = TextCollection(corpus)

    for doc in corpus:
        yield {term: texts.tf_idf(term, doc) for term in doc}
Example #12
def compute_tf_idf(question, messages):
    import math

    texts = [question.keywords]
    total_length = 0
    for m in messages:
        total_length += len(m.keywords)
        text = Text(tokens=m.keywords)
        texts.append(text)
    text_collection = TextCollection(texts)
    question_tfidf_score = 0
    for k in question.keywords:
        tf_idf = text_collection.tf_idf(k, texts[0])
        question_tfidf_score += tf_idf

    if question_tfidf_score == 0:
        question_tfidf_score = 0.2
    if total_length == 0:
        total_length = 1
    length_factor = len(question.keywords) / total_length
    score = length_factor * math.log2(question_tfidf_score * 10)
    base_score = score
    if base_score == 0:
        base_score = 1

    print(question.content, question_tfidf_score, length_factor, score)
    print("^^^^^^^^^^^^^^^^^^^^^^^^^^")
    scores = []
    total_score = score
    print("Math", math)
    for i in range(0, len(messages)):
        tf_idf_i = 0
        for k in messages[i].keywords:
            tf_idf = text_collection.tf_idf(k, texts[i + 1])
            tf_idf_i += tf_idf
        if tf_idf_i == 0:
            continue
        length_factor = len(messages[i].keywords) / total_length
        score = length_factor * math.log2(tf_idf_i * 10)
        scores.append(score)
        total_score += score
        print(messages[i].content, tf_idf_i, length_factor, score)
        print("++++++++++++++++++++++++++++++++")
        # print(scores)
    averaged_scores = []
    last_message = question
    results = [last_message]
    for i in range(0, len(scores)):
        averaged_score = scores[i] / base_score
        averaged_scores.append(averaged_score)
        if averaged_score < 0.52:
            last_message.comments.append(messages[i])
        else:
            last_message = messages[i]
            results.append(last_message)
    print(averaged_scores)
    return results
Example #13
def vectorize(corpus):
    corpus_tokenized = [list(tokenize(doc)) for doc in corpus]
    texts  = TextCollection(corpus_tokenized)
    
    for doc in corpus_tokenized:
        yield {
            term: texts.tf_idf(term, doc)
            for term in doc
        }
Example #14
def tf_idf_vectorize_nltk(corpus):
    print(corpus)
    #corpus = [tokenize(doc) for doc in corpus]
    texts  = TextCollection(corpus)
    print(texts)
    for doc in corpus:
        yield {
            term: texts.tf_idf(term, doc)
            for term in doc
        }
Example #15
def build_text_collections():
    text_collections = {}
    sample_size = 4
    for category in ["news", "learned", "fiction"]:
        texts = []
        for fileid in nltk.corpus.brown.fileids(
                categories=category)[:sample_size]:
            texts.append(tokenize(nltk.corpus.brown.raw(fileid)))
        text_collections[category] = TextCollection(texts)
    text_collections["all"] = TextCollection(text_collections.values())
    return text_collections
Example #16
class TextIndexer:
    
    __textCollection = None
    
    def __init__(self, documents):
        self.__textCollection = TextCollection(documents)
        
    def idf(self, term):
        return self.__textCollection.idf(term)
    
    def tf(self, term, text):
        return self.__textCollection.tf(term, text)
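A possible usage sketch for the TextIndexer wrapper above; the documents are invented for illustration.

# Hypothetical usage of TextIndexer with toy documents.
docs = [
    ["machine", "learning", "is", "fun"],
    ["deep", "learning", "needs", "data"],
]
indexer = TextIndexer(docs)
print(indexer.idf("learning"))      # 0.0, since the term occurs in every document
print(indexer.tf("data", docs[1]))  # 0.25, one occurrence out of four tokens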
Example #17
 def __init__(self, pairs, mode='eng', stopwords_flag=True):
     self.pair_dict = {}
     self.ids = [pair[0] for pair in pairs]
     self.tfidfs = []
     self.mode = mode
     self.stopwords_flag = stopwords_flag
     docs = [pair[1] for pair in pairs]
     self.docs = [self.preprocess(doc) for doc in docs]
     for id, text in zip(self.ids, self.docs):
         self.pair_dict[id] = text
     self.corpus = TextCollection(self.docs)
     self.query = []
Example #18
 def train(self, trainfile=None):
     print "training WeightedTweetClassifier"
     self.readTrainingData((trainfile or self.trainfile))
     for tweet in self.trainingTweets:
         # lowercase, remove punctuation
         nopunct = string.lower(
             tweet.tweet.translate(string.maketrans("", ""),
                                   string.punctuation))
         tweet.tweet = nopunct
     # add all Tweets to our TextCollection. This automatically creates a TF-IDF model
     self.textCollection = TextCollection(
         [tweet.tweet for tweet in self.trainingTweets])
Example #19
 def tfidf_extraction(self, subset=None):
     if subset is not None:
         data = self.data[subset]
     else:
         data = self.data
     get_idf = TextCollection(data.Tokenize.to_list())
     word_list = list(set([w for l in data.Tokenize.to_list() for w in l]))
     full_winfo = [[word, idf, tag[1]] for word, idf, tag in zip(word_list, [get_idf.idf(i) for i in word_list], nltk.pos_tag(word_list))]
     self.keywords = pd.DataFrame([i for i in full_winfo if i[2] in ["JJ", "NNP", "VBP", 'VBG', 'VBD', 'VBN', 'CD', 'NN', 'NNPS', 'RB', 'IN'] 
                                   and not is_number(i[0])], columns=["word", "idf", "tag"]).sort_values(by="idf", ascending=True).reset_index(drop=False)
     self.full_words = pd.DataFrame(full_winfo, columns=["word", "idf", "tag"]).sort_values(by="idf", ascending=True).reset_index(drop=False)
     self.enable_topk = True
Example #20
def run_main():
    text1 = 'I like the movie so much '
    text2 = 'That is a good movie '
    text3 = 'This is a great one '
    text4 = 'That is a really bad movie '
    text5 = 'This is a terrible movie'

    tf_analy = TextCollection([text1, text2, text3, text4, text5])

    new_text = 'That one is a good movie. This is so good!'
    word = 'That'
    tf_idf_val = tf_analy.tf_idf(word, new_text)
    print(tf_idf_val)
Example #21
def Generate_keyword(obj,length):
    orig_file = './Data/'+obj+'/'+obj+'.xlsx'
    data = xlrd.open_workbook(filename=orig_file)
    sheet = data.sheet_by_index(1)
    review_head = np.array(sheet.col_values(12))[1:]
    review_body = np.array(sheet.col_values(13))[1:]
    
    review_all=[]
    for i in range(length) :
        review = review_head[i] + " " +review_body[i]
        review_all.append(review)
    review_all = np.array(review_all)
    
    # make review tokens
    tokens=[]
    for i,review in enumerate(review_all):
        review = review.lower()
        replacer = RegexpReplacer()
        review = replacer.replace(review)
        remove = str.maketrans('','',string.punctuation) 
        review = review.translate(remove)
        token = nltk.word_tokenize(review)
        token = [w for w in token if w == 'not' or 
                 not w in stopwords.words('english')] 
        s = nltk.stem.SnowballStemmer('english')  
        token = [s.stem(ws) for ws in token]
        tokens.append(token)
    token_file = './Data/'+ obj +'/tokens.pkl'
    f=open(token_file,'wb')
    pickle.dump(tokens,f)
    f.close()
    
    corpus=TextCollection(tokens) 
    
    tf={}
    tf_idf={}
    for review in tokens:
        for word in review:
            if word not in tf :
                tf_=corpus.tf(word,corpus)
                tf[word]=tf_
            if word not in tf_idf :
                tf_idf_=corpus.tf_idf(word,corpus)
                tf_idf[word] = tf_idf_
                
    tf_sorted = sorted(tf.items(), key=lambda item:item[1], reverse=True)
    tf_idf_sorted = sorted(tf_idf.items(), key=lambda item:item[1],
                           reverse=True)
    
    pd.DataFrame(tf_sorted).to_csv('./Data/'+obj+'/tf_sorted.csv')
    pd.DataFrame(tf_idf_sorted).to_csv('./Data/'+obj+'/tf_idf_sorted.csv')
Example #22
def calculate_idf(words, corpus):
    """
    Calculate the idf of words using a corpus
    :param words:  The words to calculate their idf
    :param corpus: The corpus to use in calculation
    :return:       dict of {word: idf}
    """
    words = set(words)
    # print("Loading corpus to calculate idf...")
    corpus_collection = TextCollection(corpus)
    idfs = {}
    for word in words:
        idfs[word] = corpus_collection.idf(word)
    return idfs
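A short usage sketch for calculate_idf with a toy tokenized corpus (data and printed values are illustrative):

# Hypothetical call; a real corpus would be a list of tokenized documents.
toy_corpus = [["good", "movie"], ["bad", "movie"], ["great", "film"]]
print(calculate_idf(["movie", "film"], toy_corpus))
# -> {'movie': 0.405..., 'film': 1.098...}, i.e. log(3/2) and log(3/1)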
Example #23
def computeTFIDF_text(texts,
                      singletext):  # texts is a list of sentence strings (the corpus); singletext is a single sentence string
    texts = [nltk.word_tokenize(text) for text in texts]  # tokenize every sentence in the corpus

    corpus = TextCollection(texts)
    words = nltk.word_tokenize(singletext)  # list of words in the single sentence
    tfidf_words = {}
    # compute tf-idf for each word
    for word in words:
        idf = corpus.idf(word)  # idf
        tf = corpus.tf(word, words)  # tf
        tfidf = idf * tf
        tfidf_words[word] = tfidf
    return tfidf_words
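A usage sketch for computeTFIDF_text; the sentences are made up, and nltk.word_tokenize requires the punkt tokenizer data to be installed.

# Illustrative inputs only.
sample_corpus = ["the cat sat on the mat", "the dog chased the cat"]
print(computeTFIDF_text(sample_corpus, "the cat and the dog"))
# -> a dict mapping each word of the single sentence to its tf * idf value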
Example #24
    def preprocess(self,text):
        #text = text.split(" ");
        text = word_tokenize(text)
        if self.display:
            print "After Tokenizing"
            print text
            print "\n\n"

        text=[w.strip().lower() for w in text if not w.strip() in ENGLISH_STOPWORDS and len(w.strip())>2]
        
        tc = TextCollection([text])
        words = list(set(tc))
        
        word_tf = {word: tc.tf(word, text) * len(text) for word in words}

        return word_tf
Example #25
def getEmmaChapter():
    from nltk.text import TextCollection
    # from nltk.text import *
    import nltk
    # from nltk.book import text1, text2, text3
    gutenberg = TextCollection(nltk.corpus.gutenberg)

    # ----- IDF EXAMPLE ----
    # print(gutenberg.idf('Dick'))
    # ----- IDF EXAMPLE ----

    i = 2
    # line 2 to line 166 is chapter 1
    emma = nltk.corpus.gutenberg.sents('austen-emma.txt')
    # for l in emma:
    chapterText = ''
    while i < 167:
        # print(str(i) + ': ')
        k = 0
        l = emma[i]
        line = ''
        for w in l:
            line += l[k] + ' '
            k = k + 1
        # print(str(i) + ': ' + line + '\n')
        chapterText += line + '\n'
        i = i + 1

    print (chapterText)
    return
Example #26
def attrexplore(corpus):
    # s = "in douglas r. stinson, editor,.proc. crypto 93,.lecture notes in computer science no. 773..pages 278-291..1994..avrim blum, merrick furst, michael kearns, and richard j. lipton..springer,.cryptographic primitives based on hard learning problems.."
    # ss = SenToken(raw=s)
    # print(ss)
    # for sent in ss:
    #     print(sent)

    nltkCorpus = TextCollection(corpus)
    print(nltkCorpus.idf(term='this'))

    print(idf(term='this', corpus=corpus))

    print(nltkCorpus.tf(term='this', text='this is sentence four'))
    print(tf_idf(term='this', doc='this is sentence four', corpus=corpus))
    fdist = nltk.FreqDist(WordTokener(sent=corpus[0]))
    print(fdist.tabulate())
Example #27
def main():
    # Get text from folder or file
    # TODO Change the folder corpus to the upper level!
    texts = load_text_data(config_path)
    if not texts:
        print "No texts found"
        return
    # Dictionary that will hold all the ngrams and their values, for each measure (dict of dicts)
    scored_ngrams = {}
    # Create a list of Document objects with the texts. Pretreat them also.
    list_documents = []

    for label, text in texts.iteritems():
        list_documents.append(Document(text, stem=config_stem, name=label))

    # list_documents = TextCollection([Document(text, stem=config_stem, name=label)
    #                                  for label, text in texts.items()][:])
    list_documents = TextCollection(list_documents)
    global config_ngram
    if config_ngram == 0:
        config_ngram = 1

    #########################################N GRAM EXTRACTION #################################################

    # Now do the ngram extraction

    for ng in range(2, config_ngram + 1):
        ngrams = get_any_ngrams(list_documents, ngram=ng, k=config_top_k,
                                min_tok_len=config_min_tok_len, min_freq=config_min_tok_freq)

        scored_ngrams = update_dict_values(scored_ngrams, ngrams)

    scored_ngrams = update_dict_values(scored_ngrams, get_concordances(list_documents, scored_ngrams))
    make_tables(scored_ngrams, results_folder=config_output)
    return
Example #28
def getTopic2(text):
    # clean input
    stop = open('stopwords.txt').read()
    l = []
    src = [
        w.strip(" .,?!") for w in nltk.word_tokenize(text.lower())
        if w not in stop
    ]
    candidates = nltk.FreqDist(w for w in src if len(w) > 3)
    candidates = candidates.keys()[:10]

    # initialize vectors
    brown = TextCollection(nltk.corpus.brown)
    for w in candidates:
        l.append((w, brown.tf_idf(w, candidates)))
    vectors = [array(l)]

    # initialize the clusterer
    clusterer = nltk.cluster.kmeans.KMeansClusterer(10, euclidean_distance)
    clusterer.cluster(vectors, True)

    #pick the one closest to the center of the largest
    clusterer.Means().Max()
    o = [l for l in clusterer.Means()]
    #o = [(clusterer.classify(l.index(i)), l.index(i)) for i in range(len(l))]
    o.reverse()
    print o.pop().index(1)
Example #29
def text_classification():
    """
    Text classification
    :return:
    """
    text1 = 'I like the movie so much '
    text2 = 'That is a good movie '
    text3 = 'This is a great one '
    text4 = 'That is a really bad movie '
    text5 = 'This is a terrible movie'

    # Build the TextCollection object
    tc = TextCollection([text1, text2, text3, text4, text5])
    new_text = 'That one is a good movie. This is so good!'
    word = 'That'
    tf_idf_val = tc.tf_idf(word, new_text)
    print('TF-IDF value of {}: {}'.format(word, tf_idf_val))
Example #30
def retrieve_results(n_percentile):
    search_queries = parse_trec('documents/irg_queries.trec')
    search_collections = parse_trec('documents/irg_collection_clean.trec')
    # search_collections = parse_trec('documents/irg_collection_short.trec')
    # search_collections = eliminate_stopwords(search_collections)
    # write_collection_doc(search_collections, 'documents/irg_collection_clean.trec')

    print('======= Statistics =======')
    print(f'Queries: {len(search_queries)}')
    print(f'Collections: {len(search_collections)}')
    print(f'Removal of {int((1-n_percentile)*100)}%-ile')
    print('==========================')

    # TF-IDF
    document_results = []
    for search_query_id, search_query_text in search_queries.items():
        print(
            f'Current query id: {search_query_id}, text: "{search_query_text}"'
        )
        terms = search_query_text.split(' ')
        documents = keep_n_percentile_most_relevant_words(search_collections,
                                                          search_query_text,
                                                          n=n_percentile)
        document_scores = {}
        search_texts_collection = TextCollection(documents.values())
        for document_id, document_text in documents.items():
            for term in terms:
                current_score = document_scores.get(document_id, 0.0)
                document_scores[
                    document_id] = current_score + search_texts_collection.tf_idf(
                        term, document_text)

        rank = 1
        for document_id, document_scores in sorted(document_scores.items(),
                                                   key=lambda kv: kv[1],
                                                   reverse=True):
            if rank <= 1000:
                document_results.append(
                    Result(search_query_id, document_id, rank,
                           document_scores))
                rank += 1

    result_writer(document_results,
                  f'IE_result_keep_{int(n_percentile*100)}_percentile.trec')
    print('Done')
Example #31
 def train(self, trainfile=None):
     print "training WeightedTweetClassifier"
     self.readTrainingData((trainfile or self.trainfile))
     for tweet in self.trainingTweets:
         # lowercase, remove punctuation
         nopunct = string.lower(tweet.tweet.translate(string.maketrans("",""), string.punctuation))
         tweet.tweet = nopunct
     # add all Tweets to our TextCollection. This automatically creates a TF-IDF model
     self.textCollection = TextCollection([tweet.tweet for tweet in self.trainingTweets])
Example #32
    def preprocess(self, text):
        #text = text.split(" ");
        text = word_tokenize(text)
        if self.display:
            print "After Tokenizing"
            print text
            print "\n\n"

        text = [
            w.strip().lower() for w in text
            if not w.strip() in ENGLISH_STOPWORDS and len(w.strip()) > 2
        ]

        tc = TextCollection([text])
        words = list(set(tc))

        word_tf = {word: tc.tf(word, text) * len(text) for word in words}

        return word_tf
Example #33
def compute_tf_idf_similarity(query: str, content: str, type: str) -> float:
    """
    Compute the mean tf-idf or tf similarity for one sentence with multiple query words.
    :param query: a string containing all key words, separated by single spaces
    :param content: the content string relevant to this query.
    :param type: either "tf_idf" or "tf", selecting which score to average.
    :return: average tf-idf or tf similarity.
    """
    sents = [word_tokenize(content),
             word_tokenize("")]  # add one empty file to smooth.
    corpus = TextCollection(sents)  # build the corpus from the tokenized sentences

    result_list = []
    for key_word in query.strip(" ").split(" "):
        if type == "tf_idf":
            result_list.append(corpus.tf_idf(key_word, corpus))
        elif type == "tf":
            result_list.append(corpus.tf(key_word, corpus))
        else:
            raise KeyError

    return sum(result_list) / len(result_list)
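An illustrative call to compute_tf_idf_similarity; the query and content strings are invented, and word_tokenize again needs the punkt data.

# Hypothetical inputs; real queries and contents come from the surrounding retrieval code.
print(compute_tf_idf_similarity("good movie", "that is a good movie with a good plot", "tf"))
print(compute_tf_idf_similarity("good movie", "that is a good movie with a good plot", "tf_idf"))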
Example #34
0
def compute_tfidf(text,filename):
    numPara = len(text)
    print "there should be this many para in the text file ", numPara
    
    colList = []
    paragraphWords = []
    for i in range(numPara):
        paragraphWords = word_tokenize(text[i])
        colList.append(paragraphWords)
    collection = TextCollection(colList)
     
    for paraList in colList:
        dict={}
        for term in paraList:
            dict[term]= collection.tf_idf(term,paraList)
        d=sortDict(dict)
        textFile=open(filename,"a")
        textFile.write("\n")

        for key,value in d:
            s = str(key) + "\t" + str(value)+"\n"
            textFile.write(s)
Example #35
    def __init__(self, doc: str, vec_size: int, alpha=0.06):
        @has_vec_set(doc)
        def get_vec_set(doc_vec):
            res = {}
            cur_line = 0
            # tc = TextCollection(self.doc)
            while 1:
                try:
                    cur_words = tf_idf_sort(doc_vec.doc, doc_vec.tc, cur_line)
                    for w, v in cur_words:
                        if w in res:
                            res[w] = max(res[w], v)
                        else:
                            res[w] = v
                except IndexError:
                    break
                cur_line += 1
                print("{} \r".format(cur_line), end='')
            return res

        self.doc = read_comments(doc)
        self.tc = TextCollection(self.doc)
        self.vec_set = get_vec_set(self)
        self.vec_set = [(w, self.vec_set[w]) for w in self.vec_set]
        self.vec_set = DataFrame(self.vec_set)
        Max = self.vec_set[1].max()
        Min = self.vec_set[1].min()
        self.vec_set[1] = self.vec_set[1].apply(lambda x: (x - Min) /
                                                (Max - Min))
        self.vec_set[1] = self.vec_set[1].apply(lambda x: x * (1 - alpha))
        self.vec_set = zip(self.vec_set[0], self.vec_set[1])
        self.vec_set = {w: v for w, v in self.vec_set}
        G = Graph(doc, True)
        tex_rank_key_word = DataFrame(key_word(G, 10, 5000))
        Min = tex_rank_key_word[1].min()
        Max = tex_rank_key_word[1].max()
        tex_rank_key_word[1] = tex_rank_key_word[1].apply(
            lambda x: alpha * (x - Min) / (Max - Min))
        tex_rank_key_word = list(
            zip(tex_rank_key_word[0], tex_rank_key_word[1]))
        self.vec_set = [(w, self.vec_set[w]) for w, v in tex_rank_key_word
                        if self.vec_set[w] >= alpha]
        # for w, v in tex_rank_key_word:
        #     if w in self.vec_set:
        #         self.vec_set[w] += v
        #     else:
        #         self.vec_set[w] = v
        # self.vec_set = sorted([(w, self.vec_set[w]) for w in self.vec_set], key=lambda x: x[1], reverse=True)
        self.vec_set = sorted(self.vec_set, key=lambda x: x[1], reverse=True)
        print(len(self.vec_set))
        self.vec_size = vec_size
Example #36
def train_NB_tfidf_nltk(train_data,test_data,all_rev):   
    all_rev = [nltk.word_tokenize(rev) for rev in all_rev]
    corpus = TextCollection(all_rev)
    labels = train_data['label']
    train_rev = train_data['review']
    ID = test_data['ID']
    test_rev = test_data['review']  # test reviews, used below
    lab = get_lab(labels)
    fs_train = []
    print(train_rev[0])
    for i in range(0,len(train_rev)):
        cut_rev = nltk.word_tokenize(train_rev[i])    
        fs_dict = {}
        for j in range(0,len(cut_rev)):
            fs_dict[cut_rev[j]] = corpus.tf_idf(cut_rev[j],train_rev[i])
        fs_train.append((fs_dict,int(lab[i])))
    fs_test = []
    for i in range(0,len(test_rev)):
        cut_rev = nltk.word_tokenize(test_rev[i])    
        fs_dict = {}
        for j in range(0,len(cut_rev)):
            fs_dict[cut_rev[j]] = corpus.tf_idf(cut_rev[j],test_rev[i])
        fs_test.append(fs_dict)
    
    classifier=nltk.NaiveBayesClassifier.train(fs_train)
    label = 1
    train_score = []
    test_score = []
    for i in range(0,len(fs_train)):
        dist = classifier.prob_classify(fs_train[i][0])
        train_score.append(dist.prob(label))
    train_score = np.array(train_score,dtype="float32")
    for i in range(0,len(fs_test)):
        dist = classifier.prob_classify(fs_test[i])
        test_score.append(dist.prob(label))
    test_score = np.array(test_score,dtype="float32")
    print("AUC: ",cal_auc(train_score,lab))
    result = pd.DataFrame({'ID':ID.T,'Pred':test_score.T})
    result.to_csv("./result.csv",index = None)
Example #37
csv_read = pd.read_csv("positive-words.csv", header=0)
positive_words = list(csv_read.Positive)
csv_read = pd.read_csv("stopwords.csv", header=0)
stop = list(csv_read.stopwords)

negation = ['no','not','never','n\'t','cannot']
intensify = ['very','really','extremely','absolutely','highly']

""" Create a corpus of text """
reviews=[]
for z in reviewsx:
    n=''.join(x for x in z if x in string.printable)
    o=' '.join(n.split())
    reviews.append(o)

reviewcollection = TextCollection(word_tokenize(r) for r in reviews) #package a list of tokenized reviews
reviewset = [word_tokenize(r) for r in reviews]

""" add the pos/neg lists to a coded dictionary """
subj_dict = {}
for w in negative_words:
    subj_dict[w] = 'NEG'

for w in positive_words:
    subj_dict[w] = 'POS'

rating_dict = {}
rating_dict['NEG']= -1
rating_dict['IRR']= 0
rating_dict['POS']= 1
rating_dict['negate']= 2
Example #38
print "Finding forms for the top " + str(no_of_topwords) + \
    " words by edit distance " + \
    str(editdistance) + "; this may take a while!"
xmlcollection.get_words_by_editdistance(editdistance=editdistance,
                                        no_of_most_freq=no_of_topwords)

# Write the found sets to disk; also write most frequent words to disk.
xmlcollection.write_words_by_editdistance(editdistance=editdistance)
xmlcollection.write_topwords(no_of_words=no_of_topwords)
print "Top words written to disk."

# XXX: BIG F**K UP ################################## FIX FIX FIX #####

# Print idf, tf and tf-idf values for the term "CCC", in document
# no. 42 - for testing.
nltk_textcollection = TextCollection(xmlcollection.get_words())
print "idf: " + str(nltk_textcollection.idf("CCC"))
print "tf: " + str(nltk_textcollection.tf("CCC", 
    TextCollection(xmlcollection.get_doc(42).get_tokens())))
print "tf_idf: " + str(nltk_textcollection.tf_idf("CCC", 
    TextCollection(xmlcollection.get_doc(42).get_tokens())))

# Do that now systematically for all documents
print "Document where tf is bigger 0:"
cnt = 0
for doc in xmlcollection.get_docs():
    tf = nltk_textcollection.tf("CCC", TextCollection(doc.get_tokens()))
    stdout.write(str(tf) + ", ")
    cnt += 1
    if cnt == 10: 
        print
Example #39
def alignText(simpleParas, normalParas, pairedPara): 
    #print simpleParas, len(simpleParas)
    #print normalParas, len(normalParas)
    for key,value in pairedPara.items(): # key is the simple-paragraph index, value is the normal-paragraph index
        SPara = simpleParas[key]
        NPara = normalParas[value]
        print "=================Paragraphs were above======================================"
        # given two paragraphs, formSentenceList returns a list of all sentences (each sentence is a list of words), plus the simple-sentence list and the normal-sentence list
        colList, sslist,nslist = formSentenceList(SPara,NPara)
        collection = TextCollection(colList)

        # this is a list of Word object
        wordsWithWeight = []

        dict={}
        for sentence in colList:
            weight = 0
            for term in sentence:
                if term not in PUNCTLIST and term not in STOPWORDS and term not in commonAuxilaryVerbs:
                    weight = collection.tf_idf(term,sentence)
                    # what if the term is already in the dic, we need to add the weight
                    if(term not in dict):
                        w = Word(term,"","")
                        w.setWeight(weight)
                        wordsWithWeight.append(w)
                        #dict[term] = weight
                    # dict[term] = weight
            
            #dict = sortDict(dict)
        temp=[]
        for sentence in sslist:
            tokSen = word_tokenize(sentence)
            temp.append(tokSen)
        sslist = temp
        temp=[]
        for sentence in nslist:
            tokSen = word_tokenize(sentence)
            temp.append(tokSen)
        nslist = temp

        
        for simpleLine in sslist:
            stringSimpleLine = listToString(simpleLine)
            # semantic part
            simplefilename = "sentence1.txt"
            SFile=open(simplefilename,"w+")
            SFile.write(stringSimpleLine)
            SFile.close()
            parseFile("sentence1.txt")
            # if failed to parse, skip this sentence and continue
            if verifyParsedFile("parsedsentence1.txt")  == False:
                continue

            buildClause("parsedsentence1.txt", "one")
            # end semantic part
            maxSimilarity = 0
            for normalLine in nslist:
                stringNormalLine = listToString(normalLine)
                # semantic part
                normalfilename = "sentence2.txt"
                NFile=open(normalfilename,"w+")
                NFile.write(stringNormalLine)
                NFile.close()
                parseFile("sentence2.txt")
                #check whether parsing was done properly
                # if failed to parse, skip this sentence and continue
                if verifyParsedFile("parsedsentence2.txt")  == False:
                    continue

                # end semantic part

                #buildClause("parsedsentence1.txt", "one")
                buildClause("parsedsentence2.txt","two")
                sentence1Words = []
                sentence2Words = []
                #makeContextFile(n1,v1,n2,v2)
                
                sentence1Words, sentence2Words = makeContextFile(n1,v1,n2,v2)
                # allWords is a dictionary of word -> tf-idf, built from the wordsWithWeight list for convenient lookup
                allWords = {}
                for w in wordsWithWeight:
                    allWords[w.getValue()]=w.getWeight()
                numerator1 = 0
                denominator1 = 0
                for word in sentence1Words:
                    if(word.getValue() in allWords):
                        tfidf = allWords[word.getValue()]
                        semanticWeight = word.getWeight()
                        numerator1 = numerator1+ (semanticWeight*tfidf)
                        denominator1 = denominator1 + allWords[word.getValue()]
                if(denominator1==0):
                    denominator1 = 1
                partA = numerator1/denominator1
                numerator2 = 0
                denominator2 = 0
                for word in sentence2Words:
                    #print "dic index:->", word.getValue(),"value: ",allWords[word.getValue()]
                    if(word.getValue() in allWords): 
                        tfidf = allWords[word.getValue()]
                        semanticWeight = word.getWeight()
                        numerator2 = numerator2+ (semanticWeight*tfidf)
                        denominator2 = denominator2 + allWords[word.getValue()]
                if(denominator2==0):
                    denominator2 = 1
                partB = numerator2/denominator2
                

                SIMILARITY = (partA + partB)/2
                print "><><><><><><><><><><><><><><><><><><><><><><"
                print stringSimpleLine
                print "--------------------------------------------"
                print stringNormalLine
                print "Similarity Score -----> ", SIMILARITY
                print "><><><><><><><><><><><><><><><><><><><><><><"
Example #40
def write_tfidf_file(xmlcollection, nltk_textcollection):
    """
    Writes a tf*idf matrix file with all tf*idf values for each 
    document, row by row. The columns represent the (alphabetically
    ordered) stems available in the whole collection.
    @param xmlcollection: Collection of XML documents, type collection
    @param nltk_textcollection: NLTK TextCollection of all the stems
    """
    idf_file = get_stems_file(measure="_idf")
    avg_words_per_doc = len(xmlcollection.get_words()) / \
                        len(xmlcollection.get_docs())

    if not exists(idf_file):
        write_idf_file(xmlcollection, nltk_textcollection)

    idf_dict = DictFromFile(idf_file)
    tfidf_dict = dict()
    high_tfidf_stems = set()
    
    collection_stems = list(xmlcollection.get_stems(uniq=True))
    print "Length of collection, all stems:", len(collection_stems)
    
    # Remove most frequent (idf<2) / stop stems (or qualifying 
    # as such), and most rare stems (max(idf)), as they are of no 
    # help to separate / make up clusters
    collection_stems = get_classification_stems(collection_stems, idf_dict)
    print "Length of collection, cluster stems:", len(collection_stems)
    
    f = open(get_tfidf_matrix_file(), "w", get_def_enc())
    for doc in xmlcollection.get_docs():
        doc_stems = doc.get_stems()
        col = TextCollection("")
        
        stdout.write(doc.get_id())
        idf_row = ""
        stdout.write(" (")
        for stem in sorted(collection_stems):
            tf = col.tf(stem, doc_stems)
    
            # Reweight tf values, to get more classification words
            # and compensate for the very different document sizes 
            # available
            # Idea: Accounts for average document length, but also for
            # the number of times a word effectively occurs in a 
            # specific document; other variations can be thought of 
            # (using log) or maximal tf values
            # Note: The clustering works better with (in general)
            # smaller values
            if tf > 0.0:
                tf = 1.0 / avg_words_per_doc * tf
            # If nothing applies: tf is 0.0
                
            tfidf = tf*float(idf_dict[stem])
            tfidf_dict[stem] = tfidf

            # We may find here some threshold that makes sense
            if (tfidf > 0.0):
                stdout.write(stem + ", ")
                high_tfidf_stems.add(stem)
            
            idf_row += str(tfidf) + " "
        f.write(idf_row + "\n")
        stdout.write(")\n")
    f.close()
    print "List length of high value tf*idf terms:", len(high_tfidf_stems)
    
    sorted_tfidf_dict = \
        sorted(tfidf_dict.iteritems(), reverse=True,
               key=operator.itemgetter(1))
    
    f = open(get_stems_file(measure="_tfidf_sorted"), "w", get_def_enc())
    for pair in sorted_tfidf_dict: 
        f.write(str(pair[1]) + " " + pair[0] + "\n")
    f.close()
Example #41
import pymongo
from pymongo import Connection
MONGODB_PORT = 27017
import nltk
from nltk.corpus import brown
from nltk.text import TextCollection
mongodb=Connection("localhost", MONGODB_PORT)['cablegate']
browntext = TextCollection(brown.words(categories=['news','government']))
count=0
for ng in mongodb.ngrams.find(timeout=False):
	mongodb.ngrams.update({"_id":ng["_id"]},{"$set":{"tfidf": browntext.tf_idf(ng['label'],brown.words(categories=['news','government'])) }})
	count+=1
	print "updated tfidf for %d topics"%count
Example #42
class WeightedTweetClassifier(TweetClassifier):
    """
    Basic idea:
    train TF-IDF model on training data
    filter out all words that we do not have clues for
    multiply all remaining term weights with the corresponding clues (+1, -1, 0), and sum the results
    """
    def __init__(self, dictfile=None, trainfile=None, datafile=None, outfile=None):
        # Call the superclass constructor
        super(WeightedTweetClassifier, self).__init__(trainfile, datafile, outfile)
        self.stemmer = PorterStemmer()

        self.trainfile = trainfile
        self.datafile = datafile
        self.outfile = outfile

        #this contains the clues we were given: {"clue":1.0, "clue2":-1.0 ... }
        self.clueValues = {}

        #the NLTK TextCollection class is used because it provides TF-IDF functionality.
        self.textCollection = None

        # read the clues
        self.readDictionary(dictfile)

        # for saving sentiment scores, so they can be meaningfully used later on by e.g. the Joint Classifier
        self.scores = {}

    def readDictionary(self, dictfile=None):
        """
        read the dictionary file. +1, -1 or 0 is saved as a sentiment for each (stemmed) term in self.clueValues

        TODO: maybe we don't want to stem, but instead use the provided POS tags? could be a separate classifier though
        """
        with open(dictfile, "r") as dictdata:
            for line in dictdata.readlines():
                fields = line.split(" ")
                token = self.stemmer.stem(fields[2].split("=")[1].strip())
                polarity = fields[5].split("=")[1].strip()
                self.clueValues[token] = (1.0 if polarity == "positive" else (-1.0 if polarity == "negative" else 0.0))

    def train(self, trainfile=None):
        print "training WeightedTweetClassifier"
        self.readTrainingData((trainfile or self.trainfile))
        for tweet in self.trainingTweets:
            # lowercase, remove punctuation
            nopunct = string.lower(tweet.tweet.translate(string.maketrans("",""), string.punctuation))
            tweet.tweet = nopunct
        # add all Tweets to our TextCollection. This automatically creates a TF-IDF model
        self.textCollection = TextCollection([tweet.tweet for tweet in self.trainingTweets])

    def classifyTweets(self, datafile=None, outfile=None):
        print "reading dataset"
        self.readDataset(datafile)

        print "classifying Tweets with weighted classifier"
        for tweet in self.evalTweets:
            # score = sum of TF-IDF weighted terms which carry sentiment
            tokens = string.lower(tweet.tweet.translate(string.maketrans("",""), string.punctuation)).split(" ")
            score = sum([self.textCollection.tf_idf(token, tweet.tweet) * self.clueValues.get(self.stemmer.stem(token), 0)
                         for token in tokens])
            self.scores[(tweet.id1, tweet.id2)] = score

            # Any score very close or equal to 0 is judged to be neutral.
            tweet.sentiment = ("neutral" if abs(score) < 0.01 else ( "negative" if score < 0 else "positive"))
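To make the scoring idea described in the WeightedTweetClassifier docstring concrete, here is a stripped-down sketch of the same computation on toy data; the clue values and tweets are invented, and the real class reads them from the dictionary file and the training tweets.

from nltk.text import TextCollection

# Toy clue lexicon: +1 positive, -1 negative, 0 neutral (invented values).
clue_values = {"love": 1.0, "hate": -1.0, "movie": 0.0}

tweets = ["i love this movie", "i hate this movie", "what a movie"]
collection = TextCollection(tweets)  # raw strings, mirroring the class above

for tweet in tweets:
    # Sum of tf-idf weights of the clue words, signed by their polarity.
    score = sum(collection.tf_idf(token, tweet) * clue_values.get(token, 0)
                for token in tweet.split(" "))
    label = "neutral" if abs(score) < 0.01 else ("negative" if score < 0 else "positive")
    print(tweet, round(score, 3), label)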
Example #43
 def __init__(self, documents):
     self.__textCollection = TextCollection(documents)
Example #44
from __future__ import print_function
from nltk.corpus import PlaintextCorpusReader
from nltk.text import TextCollection

#load all the files in the corpus root,
#and calculate tf, idf, and tf_idf on them, and on a specific term

if __name__ == "__main__":
    corpus_root = '../data/source_data'
    corpus = PlaintextCorpusReader(corpus_root,'[a-zA-Z \-]*\.txt')

    ids = corpus.fileids()

    collection = TextCollection(corpus)

    #for x,word in enumerate(corpus.words(ids[0])[:200]):
    #    print(x,word)

    source = ids[0]
    term = corpus.words(source)[107]
    doc = corpus.words(ids[2])



    print("Source: ",source)
    print("TF of: ",term,": ",collection.tf(term,doc))
    print("IDF of: ",term,": ",collection.idf(term))
    print("tf_Idf of:",term,": ",collection.tf_idf(term,doc))

Example #45
# USE THIS SECTION FOR TESTING
# extract all words (IN TESTING)
if test:
	wfile = open('words-list.txt', 'r')
	for line in wfile:
		words.append(line.strip())
	wfile.close()

# print some more information
print '\nNumber of tweets: ' + str(len(tweets))
print 'Number of words occurring >1 time: ' + str(len(words))
print 'Number of words occurring 1 time: ' + str(len(words1))

# create .arff file for Weka
texts = TextCollection(tweets)
arff = open('tweets_sentiment.arff', "w")
wc = 0

# header
arff.write("@relation sentiment_analysis\n\n")
arff.write("@attribute numPosEmots numeric\n")
arff.write("@attribute numNegEmots numeric\n")
arff.write("@attribute numQuest numeric\n")
arff.write("@attribute numExclam numeric\n")
arff.write("@attribute numPosGaz numeric\n")
arff.write("@attribute numNegGaz numeric\n")
for word in words:
	arff.write("@attribute word_")
	sub_w = re.subn('[^a-zA-Z]', 'X', word)
	arff.write(sub_w[0])
Example #46
from nltk.text import TextCollection

f = open("cant.txt","r");
cont = f.read()
emails = cont.split('GROUP')
words = [email.replace('\n', ' ').split() for email in emails]
f.close()

generator = TextCollection(words)
generator.generate(150)

Example #47
from nltk.text import TextCollection

f = open("bible.txt","r");
cont = f.read()
emails = cont.split('BOOK OF ')
words = [email.replace('\n', ' ').split() for email in emails]
#print words
f.close()

generator = TextCollection(words)
#generator.generate(10)
#generator.generate(25)
generator.generate(1000)

Example #48
from nltk.text import TextCollection

f = open("cuil.txt","r");
cont = f.read()
emails = cont.split('Cuils')
words = [email.replace('\n', ' ').split() for email in emails]
f.close()

generator = TextCollection(words)
generator.generate(80)