Example #1
	def getDomainUnigram(self, directory = None):		
		collocations = set()  #collocation items
		ewordlists = list() #list of lists of words
		
		#extract words from essays
		if directory is not None:
			doclist = os.listdir(directory)
			for essay in doclist:
				dir_essay  = directory+'/'+essay
				etext = open(dir_essay,'r').read()
				tokens = nltk.wordpunct_tokenize(etext)
				tokens = [word.lower() for word in tokens]
				#stemming
				if self._stemoption ==True:
					st = PorterStemmer()
					tokens = [st.stem(t) for t in tokens]
				
				#extract the collocation for the given essay
				e_bigram = set(Mytext(tokens).collocations())
				collocations = collocations | e_bigram
				ewordlists.append(tokens)
				
		else: # using the mapped essays to calculate the candidate bigrams
			#need to call the mapessay function first
			for ins in self._data:
				if ins['essay'] is not None:
					etext = open(ins['essay'],'r').read()
					tokens = nltk.wordpunct_tokenize(etext)
					tokens = [word.lower() for word in tokens]
					#stemming
					if self._stemoption ==True:
						st = PorterStemmer()
						tokens = [st.stem(t) for t in tokens]
				
					#extract the collocation for the given essay
					e_bigram = set(Mytext(tokens).collocations())
					collocations = collocations | e_bigram
					ewordlists.append(tokens)
		
		#get collection of all essays under the specified directory / associated essays
		collection_text = TextCollection(ewordlists)
		
		itemlist = list()
		for (a, b) in collocations:
			itemlist.append(a)
			itemlist.append(b)
			
		itemlist = list(set(itemlist))	
		
		word_idf = []
		for i in range(len(itemlist)):
			word_idf.append((collection_text.idf(itemlist[i]), itemlist[i]))	
		
		word_idf = sorted(word_idf, key = operator.itemgetter(0))
		ave = 0
		if len(word_idf)!=0:
			ave = sum(map(operator.itemgetter(0), word_idf)) / len(word_idf)
			
		wlist =  [j for (i, j) in word_idf if i<ave]				
		return wlist
Example #2
class TextIndexer:
    
    __textCollection = None
    
    def __init__(self, documents):
        self.__textCollection = TextCollection(documents)
        
    def idf(self, term):
        return self.__textCollection.idf(term)
    
    def tf(self, term, text):
        return self.__textCollection.tf(term, text)
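A minimal usage sketch for the wrapper above; the sample documents and the query term are invented for illustration, and TextCollection is assumed to be imported from nltk.text as the class requires:

from nltk.text import TextCollection  # needed by TextIndexer above

docs = [['the', 'cat', 'sat'],
        ['the', 'dog', 'barked'],
        ['a', 'cat', 'and', 'a', 'dog']]   # hypothetical pre-tokenized documents
indexer = TextIndexer(docs)
print(indexer.idf('cat'))          # 'cat' occurs in 2 of 3 documents -> log(3/2)
print(indexer.tf('cat', docs[0]))  # 'cat' occurs once in the 3-token first document -> 1/3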
Example #3
 def tfidf_extraction(self, subset=None):
     if subset is not None:
         data = self.data[subset]
     else:
         data = self.data
     get_idf = TextCollection(data.Tokenize.to_list())
     word_list = list(set([w for l in data.Tokenize.to_list() for w in l]))
     full_winfo = [[word, idf, tag[1]] for word, idf, tag in zip(word_list, [get_idf.idf(i) for i in word_list], nltk.pos_tag(word_list))]
     self.keywords = pd.DataFrame([i for i in full_winfo if i[2] in ["JJ", "NNP", "VBP", 'VBG', 'VBD', 'VBN', 'CD', 'NN', 'NNPS', 'RB', 'IN'] 
                                   and not is_number(i[0])], columns=["word", "idf", "tag"]).sort_values(by="idf", ascending=True).reset_index(drop=False)
     self.full_words = pd.DataFrame(full_winfo, columns=["word", "idf", "tag"]).sort_values(by="idf", ascending=True).reset_index(drop=False)
     self.enable_topk = True
Example #4
def computeTFIDF_text(texts,
                      singletext):  # texts is a list of sentence strings (the corpus); singletext is a single sentence string
    texts = [nltk.word_tokenize(text) for text in texts]  # tokenize every sentence in the corpus

    corpus = TextCollection(texts)
    words = nltk.word_tokenize(singletext)  # token list for the single sentence
    tfidf_words = {}
    # compute tf-idf for each token
    for word in words:
        idf = corpus.idf(word)  # idf
        tf = corpus.tf(word, words)  # tf
        tfidf = idf * tf
        tfidf_words[word] = tfidf
    return tfidf_words
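A quick invented call to the function above (the sentence strings are made up; nltk and its punkt tokenizer data are assumed to be available):

sentences = ['this is sentence one', 'this is sentence two', 'this is sentence three']
scores = computeTFIDF_text(sentences, 'this is sentence one')
print(scores)  # maps each token of the single sentence to tf * idf over the corpus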
Example #5
def calculate_idf(words, corpus):
    """
    Calculate the idf of words using a corpus
    :param words:  The words to calculate their idf
    :param corpus: The corpus to use in calculation
    :return:       dict of {word: idf}
    """
    words = set(words)
    # print("Loading corpus to calculate idf...")
    corpus_collection = TextCollection(corpus)
    idfs = {}
    for word in words:
        idfs[word] = corpus_collection.idf(word)
    return idfs
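For reference, a tiny invented example; corpus here is a list of tokenized documents, which is what TextCollection accepts:

corpus = [['red', 'apple'], ['green', 'apple'], ['red', 'car']]
print(calculate_idf(['red', 'apple', 'car'], corpus))
# NLTK computes idf as log(N / number_of_documents_containing_term), so 'car' gets log(3/1)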
Example #6
def IDF(qi, t):
    if t == 'all':
        corpus = TextCollection(body_split)
    elif t == 'ques':
        corpus = TextCollection(Q_body_split)
    elif t == 'ans':
        corpus = TextCollection(A_body_split)
    elif t == 'btitle':
        corpus = TextCollection(B_title_split)
    elif t == 'bsubt':
        corpus = TextCollection(B_subt_split)
    elif t == 'bsummary':
        corpus = TextCollection(B_summary_split)
    else:
        raise ValueError('unknown text type: ' + t)

    idf = corpus.idf(qi)
    return idf
Example #7
def attrexplore(corpus):
    # s = "in douglas r. stinson, editor,.proc. crypto 93,.lecture notes in computer science no. 773..pages 278-291..1994..avrim blum, merrick furst, michael kearns, and richard j. lipton..springer,.cryptographic primitives based on hard learning problems.."
    # ss = SenToken(raw=s)
    # print(ss)
    # for sent in ss:
    #     print(sent)

    nltkCorpus = TextCollection(corpus)
    print(nltkCorpus.idf(term='this'))

    print(idf(term='this', corpus=corpus))

    print(nltkCorpus.tf(term='this', text='this is sentence four'))
    print(tf_idf(term='this', doc='this is sentence four', corpus=corpus))
    fdist = nltk.FreqDist(WordTokener(sent=corpus[0]))
    fdist.tabulate()  # tabulate() prints the frequency table itself and returns None
Example #8
def splitter(fileName):

    with open(fileName,'r') as f:
        d = json.load(f)
    # assuming the file is evenly divisible by 10
    for i in range(3):
        shuffle(d)
    d = d[:1000]
    corpus = []
    corpusList = []
    classifiers = []

    for i in range(len(d)):
        d[i]['tAbstract'] = tknize(d[i]['abstract'])
        corpus.extend(d[i]['tAbstract'])
        corpusList.extend(d[i]['tAbstract'])

        if d[i]['type'] not in classifiers:
            classifiers.append(d[i]['type'])

    # initialize numpy array
    for i in range(len(d)):
        d[i]['vector'] = numpy.empty([len(corpusList)])

    tc = TC(corpus)
    print("Starting vector calculation")
    for doc in d:
        place = 0
        for word in corpusList:
            idf = tc.idf(word)
            tf = tc.tf(word, doc['tAbstract'])

            # create a vector that is guaranteed to be in the same order for each doc, as
            # each doc appends the tf-idf score of the word to its vector at the same time
            doc['vector'][place] = idf * tf
            place += 1

    return d, classifiers
Example #9
 def bag_of_word(self):
     sub_cl = TextCollection(self.data["Tokenize"].to_list())
     sub_collection = list(set([word for text in self.data["Tokenize"].to_list() for word in text]))
     sub_dict = {}
     print("\n>> Extracting Bag-of-word Vector with TF-IDF...")
     for i in tqdm(sub_collection):
         sub_dict[i] = sub_cl.idf(i)
     sub_pos = {}
     index = 0
     for i in tqdm(sub_collection):
         sub_pos[i] = index
         index+=1
     sub_len = len(sub_collection)
     def d2v(word_list):
         v = np.zeros(sub_len)
         for i in word_list:
             v[sub_pos[i]] = sub_dict[i]
         return v
     tqdm.pandas(desc="Processing")
     self.sub_pos = sub_pos
     BaggingVector = self.data["Tokenize"].progress_apply(lambda x: d2v(x))
     print("\n>> Extracting Finished...")
     return BaggingVector, sub_pos
Example #10
 def calculate_IDFs(self, courses):
     ''' This function calculates inverse document frequencies for each word in courses
     :param courses: Structure containing courses with title and description
     :return:       A dictionary containing all words as keys with their inverse document frequencies as values
     '''
     mytexts = []
     IDF_scores = {}
     for course in courses:
         # combine title and text into one string, weighting the title
         text = course.title + " " + course.description
         text = self.preprocess(text)
         # initialize dictionary with all occurring words
         text_without_stopwords = ""
         mystopwords = stopwords.words("english") + stopwords.words(
             "german")
         for word in nltk.word_tokenize(text):
             if word not in mystopwords:
                 IDF_scores[word] = 0.0
                 text_without_stopwords = text_without_stopwords + " %s" % word
         mytexts.append(text_without_stopwords)
     myTextCollection = TextCollection(mytexts)
     for word in IDF_scores:
         IDF_scores[word] = myTextCollection.idf(word)
     return IDF_scores
Example #11
chong_1 = [word_tokenize(sent) for sent in sents_1]
corpus_1 = TextCollection(chong_1)
# print("23333333333333333333333333333333333333333333333########################")
# print(corpus_1)
# print("_________________________")

# compute the tf value of each word in the corpus
for a in filtered_words:

    tf = corpus_1.tf(a, corpus_1)  # 1/12
    #print(a,"tf:",tf)
    #print(type(a))

    # compute the idf value of each word in the corpus

    idf = corpus.idf(a)  #log(3/1)
    #print(a,"idf:",idf)

    tf_idf = tf * idf

    d = dict.fromkeys([a], tf_idf)
    #print(d)
    Word_dict.update(d)

#print("_________________________")
#print(Word_dict)
# from_high_to_low=sorted(Word_dict.items(), key=lambda d:d[1], reverse = False )
from_high_to_low = Counter(Word_dict).most_common()  # returns a list sorted by the dict's values, largest to smallest

#print(from_high_to_low)
for i in from_high_to_low:
Example #12
    " words by edit distance " + \
    str(editdistance) + "; this may take a while!"
xmlcollection.get_words_by_editdistance(editdistance=editdistance,
                                        no_of_most_freq=no_of_topwords)

# Write the found sets to disk; also write most frequent words to disk.
xmlcollection.write_words_by_editdistance(editdistance=editdistance)
xmlcollection.write_topwords(no_of_words=no_of_topwords)
print "Top words written to disk."

# XXX: BIG F**K UP ################################## FIX FIX FIX #####

# Print idf, tf and tf-idf values for the term "CCC", in document
# no. 42 - for testing.
nltk_textcollection = TextCollection(xmlcollection.get_words())
print "idf: " + str(nltk_textcollection.idf("CCC"))
print "tf: " + str(nltk_textcollection.tf("CCC", 
    TextCollection(xmlcollection.get_doc(42).get_tokens())))
print "tf_idf: " + str(nltk_textcollection.tf_idf("CCC", 
    TextCollection(xmlcollection.get_doc(42).get_tokens())))

# Do that now systematically for all documents
print "Document where tf is bigger 0:"
cnt = 0
for doc in xmlcollection.get_docs():
    tf = nltk_textcollection.tf("CCC", TextCollection(doc.get_tokens()))
    stdout.write(str(tf) + ", ")
    cnt += 1
    if cnt == 10:
        print
        cnt = 0
Example #13
# First, build the corpus
sents = [
    'this is sentence one', 'this is sentence two', 'this is sentence three'
]
sents = [word_tokenize(sent) for sent in sents]  # tokenize each sentence
print(sents)  # print the tokenized result
corpus = TextCollection(sents)  # build the corpus
print(corpus)  # print the corpus

# compute the tf value of "one" in the corpus
tf = corpus.tf('one', corpus)  # 1/12
print(tf)

# compute the idf value of "one" in the corpus
idf = corpus.idf('one')  # log(3/1)
print(idf)

# compute the tf-idf value of "one" in the corpus
tf_idf = corpus.tf_idf('one', corpus)
print(tf_idf)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

x_train = [
    'TF-IDF 主要 思想 是', '算法 一个 重要 特点 可以 脱离 语料库 背景',
    '如果 一个 网页 被 很多 其他 网页 链接 说明 网页 重要'
]
x_test = ['原始 文本 进行 标记', '主要 思想']
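The snippet breaks off after defining x_train and x_test; a plausible continuation following the standard scikit-learn pattern (a sketch under that assumption, not necessarily the original author's code) would be:

vectorizer = CountVectorizer()
transformer = TfidfTransformer()
# fit the vocabulary and idf weights on the training texts
train_tfidf = transformer.fit_transform(vectorizer.fit_transform(x_train))
# reuse the fitted vocabulary and weights on the test texts
test_tfidf = transformer.transform(vectorizer.transform(x_test))
print(train_tfidf.toarray())
print(test_tfidf.toarray())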
Example #14
from __future__ import print_function
from nltk.corpus import PlaintextCorpusReader
from nltk.text import TextCollection

#load all the files in the corpus root,
#and calculate tf, idf, and tf_idf on them, and on a specific term

if __name__ == "__main__":
    corpus_root = '../data/source_data'
    corpus = PlaintextCorpusReader(corpus_root,'[a-zA-Z \-]*\.txt')

    ids = corpus.fileids()

    collection = TextCollection(corpus)

    #for x,word in enumerate(corpus.words(ids[0])[:200]):
    #    print(x,word)

    source = ids[0]
    term = corpus.words(source)[107]
    doc = corpus.words(ids[2])



    print("Source: ",source)
    print("TF of: ",term,": ",collection.tf(term,doc))
    print("IDF of: ",term,": ",collection.idf(term))
    print("tf_Idf of:",term,": ",collection.tf_idf(term,doc))

Example #15
class CitationSearch:
    def __init__(self, pairs, mode='eng', stopwords_flag=True):
        self.pair_dict = {}
        self.ids = [pair[0] for pair in pairs]
        self.tfidfs = []
        self.mode = mode
        self.stopwords_flag = stopwords_flag
        docs = [pair[1] for pair in pairs]
        self.docs = [self.preprocess(doc) for doc in docs]
        for id, text in zip(self.ids, self.docs):
            self.pair_dict[id] = text
        self.corpus = TextCollection(self.docs)
        self.query = []

    def preprocess(self, raw):
        if self.mode == 'eng':
            return self.preprocess_eng(raw)
        if self.mode == 'ru':
            return self.preprocess_ru(raw)
        if self.mode == 'eng+ru':
            return self.preprocess_eng(raw) + self.preprocess_ru(raw)

    def preprocess_eng(self, raw):
        doc = []
        text = re.findall(words_eng, raw)
        for token in text:
            token = token.lower()
            if self.stopwords_flag:
                if token not in stopwords_eng:
                    doc.append(stemmer.stem(token))
            else:
                doc.append(stemmer.stem(token))
        return doc

    def preprocess_ru(self, raw):
        doc = []
        text = re.findall(words_ru, raw)
        for token in text:
            if self.stopwords_flag:
                if token not in stopwords_ru:
                    doc.append(morph.parse(token)[0].normal_form)
            else:
                doc.append(morph.parse(token)[0].normal_form)
        return doc

    def get_tf(self, term, document):
        return self.corpus.tf(term, document)

    def get_idf(self, term):
        return self.corpus.idf(term)

    @staticmethod
    def normalize_cosine(doc, doc_vecs):
        counter = Counter(doc)
        cosine_norm = np.sqrt(np.sum(
            np.array(list(dict(counter).values()))**2))
        doc_vector = np.array(doc_vecs) / cosine_norm
        return doc_vector

    def tfidf_docs(self):
        doc_vectors = []
        for doc in self.docs:
            doc_tfs = {}
            for term in doc:
                doc_tfs[term] = self.get_tf(term, doc) * self.get_idf(term)
            doc_vector = self.normalize_cosine(doc, list(doc_tfs.values()))
            doc_tfidfs = {}
            for term, vec in zip(doc_tfs, doc_vector):
                doc_tfidfs[term] = vec
            doc_vectors.append(doc_tfidfs)
        self.tfidfs = doc_vectors

    def tfidf_queries(self, query):
        self.query = self.preprocess(query)
        query_tfsidfs = {}
        for term in self.query:
            query_tfsidfs[term] = self.get_tf(term,
                                              self.query) * self.get_idf(term)
        return query_tfsidfs

    def query_relevance(self, query):
        tfidf = self.tfidf_queries(query)
        query_vec = list(tfidf.values())
        doc_vecs = []
        for doc in self.tfidfs:
            doc_vec = []
            for term_query in tfidf:
                if term_query in doc:
                    doc_vec.append(doc[term_query])
                else:
                    doc_vec.append(0)
            doc_vecs.append(doc_vec)
        cosines = []
        for vec in doc_vecs:
            if np.any(vec):
                cosines.append(1 - cosine(vec, query_vec))
            else:
                cosines.append(0)
        relevance_ids = [
            text_id for _, text_id in sorted(
                zip(cosines, self.ids), key=(lambda x: x[0]), reverse=True)
        ]
        cosines.sort(reverse=True)
        most_relevant = relevance_ids[0]
        relevant_candidates = [relevance_ids[0]]
        for cos in range(1, len(cosines)):
            if cosines[0] - cosines[cos] <= 0.000001:
                relevant_candidates.append(relevance_ids[cos])
        if len(relevant_candidates) > 1:
            tiebreaker = []
            for id in relevant_candidates:
                rel_text = self.pair_dict[id]
                absent_words = 0
                for word in rel_text:
                    if word not in self.query:
                        absent_words += 1
                tiebreaker.append(absent_words)
            relevant_candidates = [
                text_id
                for _, text_id in sorted(zip(tiebreaker, relevant_candidates),
                                         key=(lambda x: x[0]))
            ]
            most_relevant = relevant_candidates[0]
        return most_relevant, cosines[0]
Example #16
#     return uniqueResult
########################################################################################################################

######### Test with topN words from allover the corpus##################################################################
# all_words = nltk.FreqDist(w for w in dialect.words())
# topN = all_words.most_common(4000)
########################################################################################################################

########## Test with getting topN with highest idf #####################################################################
uniqueWords = []
for i in text:
    if not i in uniqueWords:
        uniqueWords.append(i)
print('unique words count', len(uniqueWords))

wordsIDF = [(word, x.idf(word)) for word in uniqueWords]
#print('finished getting idf for all unique words' , wordsIDF)

sortedidfs = sorted(wordsIDF, key=itemgetter(1), reverse=True)

#print('sortedidfs' , sortedidfs)
print('length of words with idfs', len(sortedidfs))

eliminateBiggerThanOne = []
for i in sortedidfs:
    if i[1] < 1:
        eliminateBiggerThanOne.append(i)
print('eliminated bigger than one', eliminateBiggerThanOne)
print('length of elimination', len(eliminateBiggerThanOne))

topN = []
#         if not i in uniqueResult:
#             uniqueResult.append(i)
#     return uniqueResult
#
# topN = [i[0] for i in topNwords()]
########################################################################################################################


Example #18
from nltk.text import TextCollection

mytext = TextCollection([
    'The cat hit that dog'.lower(), 'This is a fat dog'.lower(),
    'I like this dog', 'I like to go to school'
])
print(mytext.idf('this'))
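Note that the collection above is built from raw strings, so idf's membership test ('this' in text) is a substring check rather than a token match. A small variation (not part of the original snippet) that tokenizes first gives idf over whole tokens:

from nltk.tokenize import word_tokenize
from nltk.text import TextCollection

sents = ['The cat hit that dog', 'This is a fat dog',
         'I like this dog', 'I like to go to school']
tokenized = [word_tokenize(s.lower()) for s in sents]
mytext = TextCollection(tokenized)
print(mytext.idf('this'))  # 'this' appears in 2 of 4 documents -> log(4/2)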