Example #1
def build_text_collections():
    text_collections = {}
    sample_size = 4
    for category in ["news", "learned", "fiction"]:
        texts = []
        for fileid in nltk.corpus.brown.fileids(
                categories=category)[:sample_size]:
            texts.append(tokenize(nltk.corpus.brown.raw(fileid)))
        text_collections[category] = TextCollection(texts)
    text_collections["all"] = TextCollection(text_collections.values())
    return text_collections
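A minimal usage sketch for the function above, assuming the Brown corpus is available; the module's own tokenize() helper is not shown here, so the stand-in below is an assumption:

import nltk
from nltk.text import TextCollection

def tokenize(raw):
    # stand-in for the module's tokenize() helper (assumed to be a plain word tokenizer)
    return nltk.word_tokenize(raw)

collections = build_text_collections()
print(sorted(collections))        # ['all', 'fiction', 'learned', 'news']
news = collections["news"]
print(news.idf(news.tokens[0]))   # idf of an arbitrary token from the news sample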
Example #2
def compute_tfidf(text, filename):
    numPara = len(text)
    print("there should be this many paragraphs in the text file:", numPara)

    # tokenize each paragraph and build one collection over all of them
    colList = []
    for i in range(numPara):
        colList.append(word_tokenize(text[i]))
    collection = TextCollection(colList)

    for paraList in colList:
        weights = {}
        for term in paraList:
            weights[term] = collection.tf_idf(term, paraList)
            print(term, "has weight:", weights[term])

        d = sortDict(weights)
        print("AFTER SORTED:", type(d))
        with open(filename, "a") as textFile:
            textFile.write("\n")
            for key, value in d:
                textFile.write(str(key) + "\t" + str(value) + "\n")
Example #3
    def get_tf_idf_dict_nltk(
            self,
            column_type="review_body",
            save_path="tf_idf_value/hair_dryer_tf_idf_dict.csv"):
        '''
            ### nltk version
            it's super slow so don't use it
        '''
        reviews = self.raw_df[column_type].tolist()

        # clean the raw reviews
        reviews_list_cleaned = clean_tsv(reviews)

        # collect the vocabulary across all cleaned reviews
        words = set()
        for review_tokens in reviews_list_cleaned:
            for token in review_tokens:
                words.add(token)

        words = list(words)

        corpus = TextCollection(reviews_list_cleaned)

        tf_idf = []
        for word in words:
            tf_idf.append(corpus.tf_idf(word, corpus))

        df = pd.DataFrame({"word": words, "tf-idf": tf_idf})
        df.to_csv(save_path, encoding='utf-8')
Example #4
def getEmmaChapter():
    import nltk
    from nltk.text import TextCollection

    gutenberg = TextCollection(nltk.corpus.gutenberg)

    # ----- IDF EXAMPLE ----
    # print(gutenberg.idf('Dick'))
    # ----- IDF EXAMPLE ----

    # sentences 2 to 166 of austen-emma.txt make up chapter 1
    emma = nltk.corpus.gutenberg.sents('austen-emma.txt')
    chapterText = ''
    for sent in emma[2:167]:
        chapterText += ' '.join(sent) + '\n'

    print(chapterText)
    return chapterText
Example #5
def nltk_tf_idf(corpus_one, file_name):
    print('-----starting nltk_tf_idf')
    corpus_one = [nltk.word_tokenize(doc) for doc in corpus_one]
    texts = TextCollection(corpus_one)

    for doc in corpus_one:
        yield {term: texts.tf_idf(term, doc) for term in doc}
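A quick, hypothetical way to drive the generator above; note that the file_name argument is not used inside the function as written:

import nltk
from nltk.text import TextCollection

docs = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "a bird flew over the mat",
]
for vector in nltk_tf_idf(docs, "unused.txt"):
    # show each document's three highest-weighted terms
    print(sorted(vector.items(), key=lambda kv: kv[1], reverse=True)[:3])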
Example #6
def main():
    # Get text from folder or file
    # TODO Change the folder corpus to the upper level!
    texts = load_text_data(config_path)
    if not texts:
        print("No texts found")
        return
    # Dictionary that will hold all the ngrams and their values, for each measure (dict of dicts)
    scored_ngrams = {}
    # Create a list of Document objects with the texts. Pretreat them also.
    list_documents = []

    for label, text in texts.items():
        list_documents.append(Document(text, stem=config_stem, name=label))

    # list_documents = TextCollection([Document(text, stem=config_stem, name=label)
    #                                  for label, text in texts.items()][:])
    list_documents = TextCollection(list_documents)
    global config_ngram
    if config_ngram == 0:
        config_ngram = 1

    # ----- n-gram extraction -----

    for ng in range(2, config_ngram + 1):
        ngrams = get_any_ngrams(list_documents, ngram=ng, k=config_top_k,
                                min_tok_len=config_min_tok_len, min_freq=config_min_tok_freq)

        scored_ngrams = update_dict_values(scored_ngrams, ngrams)

    scored_ngrams = update_dict_values(scored_ngrams, get_concordances(list_documents, scored_ngrams))
    make_tables(scored_ngrams, results_folder=config_output)
    return
Example #7
def ranking(reuters, corpus, docids, palavras):
    '''Ranks the documents returned by the search, with the most relevant first.

    Args:
        reuters: the Reuters corpus from NLTK
        corpus: dictionary mapping document index to text
        docids: indices of the retrieved documents
        palavras: tokenized words from the query

    Returns:
        List with all the document indices, already ranked
    '''
    rank = {}
    tc = TextCollection(reuters)

    for e in docids:
        rank[e] = 0
        for i in palavras:
            rank[e] += tc.tf_idf(i, corpus[e])

    rank = {
        k: v
        for k, v in reversed(sorted(rank.items(), key=lambda item: item[1]))
    }
    return rank.keys()
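For context, a hypothetical call with tiny, made-up stand-ins for the project's real inputs; it assumes the Reuters corpus has been downloaded, and since the function builds a TextCollection over the whole corpus, expect it to be slow:

from nltk.corpus import reuters
from nltk.text import TextCollection

fileids = reuters.fileids()[:2]
# hypothetical index -> text mapping and query terms
corpus = {i: reuters.words(fid) for i, fid in enumerate(fileids)}
ranked = ranking(reuters, corpus, docids=[0, 1], palavras=["oil", "price"])
print(list(ranked))   # document indices, most relevant first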
Example #8
def vectorize_t(corpus):
    # corpus is assumed to be a list of tokenized documents
    # (uncomment the next line if the documents are raw strings)
    #corpus = [tokenize(doc) for doc in corpus]
    texts = TextCollection(corpus)
    return [
        {term: texts.tf_idf(term, doc) for term in doc}
        for doc in corpus
    ]
Example #9
def sentenceAlignment(simpleParas, normalParas, pairedPara):
    for key, value in pairedPara.items():  # key is simple and value is normal
        print("**********************************")
        print("PARAGRAPH")
        print("##################################")

        SPara = simpleParas[key]
        NPara = normalParas[value]

        # Given two paragraphs, formSentenceList returns a combined list of all
        # sentences (each sentence is a list of words), plus the simple and
        # normal sentence lists separately.
        colList, sslist, nslist = formSentenceList(SPara, NPara)
        collection = TextCollection(colList)

        weights = {}
        for sentence in colList:
            for term in sentence:
                weight = collection.tf_idf(term, sentence)
                print("TERM ->", term, "is", weight)
                # if the term is already in the dict, accumulate its weight
                if term in weights:
                    weights[term] += weight
                else:
                    weights[term] = weight

            # weights = sortDict(weights)
        print("================================================================")
Example #10
def getTopic2(text):
    # clean input
    stop = set(open('stopwords.txt').read().split())
    l = []
    src = [
        w.strip(" .,?!") for w in nltk.word_tokenize(text.lower())
        if w not in stop
    ]
    candidates = nltk.FreqDist(w for w in src if len(w) > 3)
    candidates = [w for w, _ in candidates.most_common(10)]

    # initialize vectors: one tf-idf weight per candidate word,
    # scored against the Brown corpus
    brown = TextCollection(nltk.corpus.brown)
    for w in candidates:
        l.append((w, brown.tf_idf(w, candidates)))
    vectors = [array([weight for _, weight in l])]

    # initialize the clusterer
    clusterer = nltk.cluster.kmeans.KMeansClusterer(10, euclidean_distance)
    clusterer.cluster(vectors, True)

    # pick the one closest to the center of the largest cluster
    o = list(clusterer.means())
    o.reverse()
    print(o.pop())
Example #11
    def __vectorize(self, corpus):
        corpus = [list(self.__tokenize(doc)) for doc in corpus]

        texts = TextCollection(corpus)

        for doc in corpus:
            yield {term: texts.tf_idf(term, doc) for term in doc}
Example #12
def nltk_tfidf_vectorize(corpus):
    from nltk.text import TextCollection

    corpus = [list(tokenize(doc)) for doc in corpus]
    texts = TextCollection(corpus)

    for doc in corpus:
        yield {term: texts.tf_idf(term, doc) for term in doc}
Example #13
 def tf_idf(self):
     corpus = [
         list(self.cr.tokenize_strip_punct(desc))
         for desc in self.cr.texts()
     ]
     texts = TextCollection(corpus)
     for desc in corpus:
         yield {term: texts.tf_idf(term, desc) for term in desc}
Example #14
def vectorize(corpus):
    corpus_tokenized = [list(tokenize(doc)) for doc in corpus]
    texts  = TextCollection(corpus_tokenized)
    
    for doc in corpus_tokenized:
        # yield one {term: tf-idf} mapping per document
        yield {
            term: texts.tf_idf(term, doc)
            for term in doc
        }
Example #15
def tf_idf_vectorize_nltk(corpus):
    print(corpus)
    #corpus = [tokenize(doc) for doc in corpus]
    texts  = TextCollection(corpus)
    print(texts)
    for doc in corpus:
        yield {
            term: texts.tf_idf(term, doc)
            for term in doc
        }
Example #16
 def tfidf_extraction(self, subset=None):
     if subset is not None:
         data = self.data[subset]
     else:
         data = self.data
     get_idf = TextCollection(data.Tokenize.to_list())
     word_list = list(set([w for l in data.Tokenize.to_list() for w in l]))
     idfs = [get_idf.idf(w) for w in word_list]
     tags = nltk.pos_tag(word_list)
     full_winfo = [[word, idf, tag[1]]
                   for word, idf, tag in zip(word_list, idfs, tags)]
     keep_tags = ["JJ", "NNP", "VBP", "VBG", "VBD", "VBN", "CD", "NN",
                  "NNPS", "RB", "IN"]
     self.keywords = pd.DataFrame(
         [i for i in full_winfo if i[2] in keep_tags and not is_number(i[0])],
         columns=["word", "idf", "tag"]
     ).sort_values(by="idf", ascending=True).reset_index(drop=False)
     self.full_words = pd.DataFrame(
         full_winfo, columns=["word", "idf", "tag"]
     ).sort_values(by="idf", ascending=True).reset_index(drop=False)
     self.enable_topk = True
Example #17
 def train(self, trainfile=None):
     print("training WeightedTweetClassifier")
     self.readTrainingData(trainfile or self.trainfile)
     for tweet in self.trainingTweets:
         # lowercase, remove punctuation
         nopunct = tweet.tweet.lower().translate(
             str.maketrans("", "", string.punctuation))
         tweet.tweet = nopunct
     # add all tweets to our TextCollection; this automatically builds a
     # TF-IDF model over the training set
     self.textCollection = TextCollection(
         [tweet.tweet for tweet in self.trainingTweets])
Example #18
 def __init__(self, pairs, mode='eng', stopwords_flag=True):
     self.pair_dict = {}
     self.ids = [pair[0] for pair in pairs]
     self.tfidfs = []
     self.mode = mode
     self.stopwords_flag = stopwords_flag
     docs = [pair[1] for pair in pairs]
     self.docs = [self.preprocess(doc) for doc in docs]
     for id, text in zip(self.ids, self.docs):
         self.pair_dict[id] = text
     self.corpus = TextCollection(self.docs)
     self.query = []
Example #19
def run_main():
    text1 = 'I like the movie so much '
    text2 = 'That is a good movie '
    text3 = 'This is a great one '
    text4 = 'That is a really bad movie '
    text5 = 'This is a terrible movie'

    # tokenize the texts so TextCollection counts words rather than characters
    tf_analy = TextCollection(
        [t.split() for t in [text1, text2, text3, text4, text5]])

    new_text = 'That one is a good movie. This is so good!'
    word = 'That'
    tf_idf_val = tf_analy.tf_idf(word, new_text.split())
    print(tf_idf_val)
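One detail worth spelling out: TextCollection treats each item it is given as a sequence, so raw strings are counted character-by-character and substring-by-substring, while tokenized input gives word-level counts. A quick comparison:

from nltk.text import TextCollection

raw = TextCollection(['That is a good movie', 'This is a great one'])
tok = TextCollection([['That', 'is', 'a', 'good', 'movie'],
                      ['This', 'is', 'a', 'great', 'one']])
print(raw.tf('a', 'That is a good movie'))                 # 2/20: substring count over characters
print(tok.tf('a', ['That', 'is', 'a', 'good', 'movie']))   # 1/5: token count over words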
Example #20
    def __init__(self, doc: str, vec_size: int, alpha=0.06):
        @has_vec_set(doc)
        def get_vec_set(doc_vec):
            res = {}
            cur_line = 0
            # tc = TextCollection(self.doc)
            while 1:
                try:
                    cur_words = tf_idf_sort(doc_vec.doc, doc_vec.tc, cur_line)
                    for w, v in cur_words:
                        if w in res:
                            res[w] = max(res[w], v)
                        else:
                            res[w] = v
                except IndexError:
                    break
                cur_line += 1
                print("{} \r".format(cur_line), end='')
            return res

        self.doc = read_comments(doc)
        self.tc = TextCollection(self.doc)
        self.vec_set = get_vec_set(self)
        self.vec_set = [(w, self.vec_set[w]) for w in self.vec_set]
        self.vec_set = DataFrame(self.vec_set)
        Max = self.vec_set[1].max()
        Min = self.vec_set[1].min()
        self.vec_set[1] = self.vec_set[1].apply(lambda x: (x - Min) /
                                                (Max - Min))
        self.vec_set[1] = self.vec_set[1].apply(lambda x: x * (1 - alpha))
        self.vec_set = zip(self.vec_set[0], self.vec_set[1])
        self.vec_set = {w: v for w, v in self.vec_set}
        G = Graph(doc, True)
        tex_rank_key_word = DataFrame(key_word(G, 10, 5000))
        Min = tex_rank_key_word[1].min()
        Max = tex_rank_key_word[1].max()
        tex_rank_key_word[1] = tex_rank_key_word[1].apply(
            lambda x: alpha * (x - Min) / (Max - Min))
        tex_rank_key_word = list(
            zip(tex_rank_key_word[0], tex_rank_key_word[1]))
        self.vec_set = [(w, self.vec_set[w]) for w, v in tex_rank_key_word
                        if self.vec_set[w] >= alpha]
        # for w, v in tex_rank_key_word:
        #     if w in self.vec_set:
        #         self.vec_set[w] += v
        #     else:
        #         self.vec_set[w] = v
        # self.vec_set = sorted([(w, self.vec_set[w]) for w in self.vec_set], key=lambda x: x[1], reverse=True)
        self.vec_set = sorted(self.vec_set, key=lambda x: x[1], reverse=True)
        print(len(self.vec_set))
        self.vec_size = vec_size
Example #21
def Generate_keyword(obj, length):
    orig_file = './Data/' + obj + '/' + obj + '.xlsx'
    data = xlrd.open_workbook(filename=orig_file)
    sheet = data.sheet_by_index(1)
    review_head = np.array(sheet.col_values(12))[1:]
    review_body = np.array(sheet.col_values(13))[1:]

    review_all = []
    for i in range(length):
        review = review_head[i] + " " + review_body[i]
        review_all.append(review)
    review_all = np.array(review_all)

    # make review tokens
    tokens = []
    for i, review in enumerate(review_all):
        review = review.lower()
        replacer = RegexpReplacer()
        review = replacer.replace(review)
        remove = str.maketrans('', '', string.punctuation)
        review = review.translate(remove)
        token = nltk.word_tokenize(review)
        token = [w for w in token if w == 'not' or
                 w not in stopwords.words('english')]
        s = nltk.stem.SnowballStemmer('english')
        token = [s.stem(ws) for ws in token]
        tokens.append(token)
    token_file = './Data/' + obj + '/tokens.pkl'
    with open(token_file, 'wb') as f:
        pickle.dump(tokens, f)

    corpus = TextCollection(tokens)

    tf = {}
    tf_idf = {}
    for review in tokens:
        for word in review:
            if word not in tf:
                tf[word] = corpus.tf(word, corpus)
            if word not in tf_idf:
                tf_idf[word] = corpus.tf_idf(word, corpus)

    tf_sorted = sorted(tf.items(), key=lambda item: item[1], reverse=True)
    tf_idf_sorted = sorted(tf_idf.items(), key=lambda item: item[1],
                           reverse=True)

    pd.DataFrame(tf_sorted).to_csv('./Data/' + obj + '/tf_sorted.csv')
    pd.DataFrame(tf_idf_sorted).to_csv('./Data/' + obj + '/tf_idf_sorted.csv')
Example #22
def computeTFIDF_text(texts, singletext):
    # texts is a list of sentence strings (the corpus);
    # singletext is a single sentence string
    texts = [nltk.word_tokenize(text) for text in texts]  # tokenize every sentence

    corpus = TextCollection(texts)
    words = nltk.word_tokenize(singletext)  # list of words in the sentence
    tfidf_words = {}
    # compute tf-idf for each word
    for word in words:
        idf = corpus.idf(word)         # idf
        tf = corpus.tf(word, words)    # tf
        tfidf = idf * tf
        tfidf_words[word] = tfidf
    return tfidf_words
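The manual product above matches what tf_idf returns, since current NLTK defines tf as count/len(text) and idf as the natural log of (number of texts / number of texts containing the term). A small sanity check:

import math
from nltk.text import TextCollection

docs = [['a', 'b'], ['a', 'c'], ['b', 'c']]
tc = TextCollection(docs)
assert abs(tc.tf('a', ['a', 'a', 'b']) - 2 / 3) < 1e-9    # 2 occurrences out of 3 tokens
assert abs(tc.idf('a') - math.log(3 / 2)) < 1e-9          # 'a' appears in 2 of 3 texts
assert abs(tc.tf_idf('a', ['a', 'a', 'b'])
           - tc.tf('a', ['a', 'a', 'b']) * tc.idf('a')) < 1e-9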
Example #23
def calculate_idf(words, corpus):
    """
    Calculate the idf of words using a corpus
    :param words:  The words to calculate their idf
    :param corpus: The corpus to use in calculation
    :return:       dict of {word: idf}
    """
    words = set(words)
    # print("Loading corpus to calculate idf...")
    corpus_collection = TextCollection(corpus)
    idfs = {}
    for word in words:
        idfs[word] = corpus_collection.idf(word)
    return idfs
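A hypothetical call with a tiny tokenized corpus, just to show the expected shapes:

from nltk.text import TextCollection

tiny_corpus = [['the', 'cat', 'sat'], ['the', 'dog', 'ran'], ['a', 'cat', 'ran']]
print(calculate_idf(['cat', 'the', 'dog'], tiny_corpus))
# 'dog' gets the highest idf here, since it appears in only one of the three documents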
Example #24
def getTextCollectionFromTxtFile(fn):
    '''
    Create text collection from external text files
    Input:
        fn - name of the external text file
    Output:
        textCollection containing all texts in the given file
    '''
    tc = []
    alltokens = []
    with open(fn, 'r') as f:
        for line in f:
            text, tokens = getTextFromString(line)
            tc.append(text)
            alltokens.extend(tokens)
    return TextCollection(tc), alltokens
Example #25
def attrexplore(corpus):
    # s = "in douglas r. stinson, editor,.proc. crypto 93,.lecture notes in computer science no. 773..pages 278-291..1994..avrim blum, merrick furst, michael kearns, and richard j. lipton..springer,.cryptographic primitives based on hard learning problems.."
    # ss = SenToken(raw=s)
    # print(ss)
    # for sent in ss:
    #     print(sent)

    nltkCorpus = TextCollection(corpus)
    print(nltkCorpus.idf(term='this'))

    print(idf(term='this', corpus=corpus))

    print(nltkCorpus.tf(term='this', text='this is sentence four'))
    print(tf_idf(term='this', doc='this is sentence four', corpus=corpus))
    fdist = nltk.FreqDist(WordTokener(sent=corpus[0]))
    fdist.tabulate()
Example #26
def compute_features(blog):
    features = []
    text_collection = []
    for doc in blog.docs:
        text_collection.append(' '.join(doc))
    text_collection = TextCollection(text_collection)  # makes computing tf-idf convenient
    pageranks = PageRank(blog)  # PageRank score of every sentence
    for i, doc in enumerate(blog.docs):
        for j, sent in enumerate(doc):
            cur_feat = []
            cur_feat.extend(surface(blog, i, j, sent))
            cur_feat.extend(content(blog, i, j, sent, text_collection))
            cur_feat.extend(rel(blog, i, j, sent))
            cur_feat.append(pageranks[i][j])
            features.append(cur_feat)
    features = normalize(features)
    return features
Example #27
def text_classification():
    """
    Text classification
    :return:
    """
    text1 = 'I like the movie so much '
    text2 = 'That is a good movie '
    text3 = 'This is a great one '
    text4 = 'That is a really bad movie '
    text5 = 'This is a terrible movie'

    # build the TextCollection object over tokenized texts so that
    # words, not characters, are counted
    tc = TextCollection(
        [t.split() for t in [text1, text2, text3, text4, text5]])
    new_text = 'That one is a good movie. This is so good!'
    word = 'That'
    tf_idf_val = tc.tf_idf(word, new_text.split())
    print('TF-IDF value of {}: {}'.format(word, tf_idf_val))
Example #28
def retrieve_results(n_percentile):
    search_queries = parse_trec('documents/irg_queries.trec')
    search_collections = parse_trec('documents/irg_collection_clean.trec')
    # search_collections = parse_trec('documents/irg_collection_short.trec')
    # search_collections = eliminate_stopwords(search_collections)
    # write_collection_doc(search_collections, 'documents/irg_collection_clean.trec')

    print('======= Statistics =======')
    print(f'Queries: {len(search_queries)}')
    print(f'Collections: {len(search_collections)}')
    print(f'Removal of {int((1-n_percentile)*100)}%-ile')
    print('==========================')

    # TF-IDF
    document_results = []
    for search_query_id, search_query_text in search_queries.items():
        print(
            f'Current query id: {search_query_id}, text: "{search_query_text}"'
        )
        terms = search_query_text.split(' ')
        documents = keep_n_percentile_most_relevant_words(search_collections,
                                                          search_query_text,
                                                          n=n_percentile)
        document_scores = {}
        search_texts_collection = TextCollection(documents.values())
        for document_id, document_text in documents.items():
            for term in terms:
                current_score = document_scores.get(document_id, 0.0)
                document_scores[document_id] = (
                    current_score
                    + search_texts_collection.tf_idf(term, document_text))

        rank = 1
        for document_id, document_score in sorted(document_scores.items(),
                                                  key=lambda kv: kv[1],
                                                  reverse=True):
            if rank <= 1000:
                document_results.append(
                    Result(search_query_id, document_id, rank,
                           document_score))
                rank += 1

    result_writer(document_results,
                  f'IE_result_keep_{int(n_percentile*100)}_percentile.trec')
    print('Done')
Example #29
def calc_tf_idfs(count):
    """Loops through archived wordlists, loads each, calculates the TF-IDF
    score for the words contained, writes to a dict and saves it as a pickle.
    """
    corpus = TextCollection(nltk.corpus.webtext)

    filepath = '/home/jrwalk/python/empath/data/reddit/pickles/'
    files = glob.glob(filepath + 'wordcount*%s.pkl' % count)
    filecount = len(files)
    for i, picklefile in enumerate(files):
        print("%i/%i processing %s" % (i + 1, filecount, picklefile))
        with open(picklefile, 'rb') as readfile:
            freqdist = pickle.load(readfile)[2]
        wordscores = tf_idf(freqdist, corpus)
        druglim = re.findall('[a-z]+_[0-9]+|all|antidepressant', picklefile)[0]
        writepath = filepath + 'tfidf_' + druglim + '.pkl'
        with open(writepath, 'wb') as writefile:
            pickle.dump(wordscores, writefile)
Example #30
    def preprocess(self, text):
        # text = text.split(" ")
        text = word_tokenize(text)
        if self.display:
            print("After Tokenizing")
            print(text)
            print("\n\n")

        text = [
            w.strip().lower() for w in text
            if w.strip() not in ENGLISH_STOPWORDS and len(w.strip()) > 2
        ]

        tc = TextCollection([text])
        words = list(set(tc))

        word_tf = {word: tc.tf(word, text) * len(text) for word in words}

        return word_tf