Example #1
class TextIndexer:
    
    __textCollection = None
    
    def __init__(self, documents):
        self.__textCollection = TextCollection(documents)
        
    def idf(self, term):
        return self.__textCollection.idf(term)
    
    def tf(self, term, text):
        return self.__textCollection.tf(term, text)
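For context, here is a minimal usage sketch of this wrapper, assuming nltk is installed and TextCollection is imported as shown; the two pre-tokenized documents are invented for illustration:

from nltk.text import TextCollection

docs = [['the', 'cat', 'sat'], ['the', 'dog', 'barked']]  # two toy documents, already tokenized
indexer = TextIndexer(docs)
print(indexer.idf('cat'))          # log(2/1): 'cat' occurs in one of the two documents
print(indexer.tf('the', docs[0]))  # 1/3 within the first document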
Example #2
def Generate_keyword(obj, length):
    # read the review headline and body columns from the spreadsheet
    orig_file = './Data/' + obj + '/' + obj + '.xlsx'
    data = xlrd.open_workbook(filename=orig_file)
    sheet = data.sheet_by_index(1)
    review_head = np.array(sheet.col_values(12))[1:]
    review_body = np.array(sheet.col_values(13))[1:]

    # concatenate headline and body for each review
    review_all = []
    for i in range(length):
        review = review_head[i] + " " + review_body[i]
        review_all.append(review)
    review_all = np.array(review_all)

    # make review tokens: lower-case, apply regex replacements, strip punctuation,
    # drop stop words (keeping "not"), and stem
    tokens = []
    for i, review in enumerate(review_all):
        review = review.lower()
        replacer = RegexpReplacer()
        review = replacer.replace(review)
        remove = str.maketrans('', '', string.punctuation)
        review = review.translate(remove)
        token = nltk.word_tokenize(review)
        token = [w for w in token
                 if w == 'not' or w not in stopwords.words('english')]
        s = nltk.stem.SnowballStemmer('english')
        token = [s.stem(ws) for ws in token]
        tokens.append(token)
    token_file = './Data/' + obj + '/tokens.pkl'
    with open(token_file, 'wb') as f:
        pickle.dump(tokens, f)

    # build the corpus and score every distinct word against it
    corpus = TextCollection(tokens)

    tf = {}
    tf_idf = {}
    for review in tokens:
        for word in review:
            if word not in tf:
                tf[word] = corpus.tf(word, corpus)
            if word not in tf_idf:
                tf_idf[word] = corpus.tf_idf(word, corpus)

    tf_sorted = sorted(tf.items(), key=lambda item: item[1], reverse=True)
    tf_idf_sorted = sorted(tf_idf.items(), key=lambda item: item[1],
                           reverse=True)

    pd.DataFrame(tf_sorted).to_csv('./Data/' + obj + '/tf_sorted.csv')
    pd.DataFrame(tf_idf_sorted).to_csv('./Data/' + obj + '/tf_idf_sorted.csv')
Example #3
def computeTFIDF_text(texts, singletext):
    # texts: list of sentence strings (the corpus); singletext: a single sentence string
    texts = [nltk.word_tokenize(text) for text in texts]  # tokenize every sentence in the list

    corpus = TextCollection(texts)
    words = nltk.word_tokenize(singletext)  # list of words in the single sentence
    tfidf_words = {}
    # compute tf-idf for every word of the sentence
    for word in words:
        idf = corpus.idf(word)
        tf = corpus.tf(word, words)
        tfidf = idf * tf
        tfidf_words[word] = tfidf
    return tfidf_words
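A quick call of this helper with made-up sentences (nltk's punkt tokenizer data must be available for word_tokenize); words that occur in every corpus sentence get an idf of 0, so their tf-idf is 0 as well:

corpus_sentences = ['this is sentence one',
                    'this is sentence two',
                    'this is sentence three']
print(computeTFIDF_text(corpus_sentences, 'this is sentence one'))
# only 'one' gets a non-zero score; 'this', 'is' and 'sentence' occur in every sentence, so idf = 0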
Example #4
    def preprocess(self,text):
        #text = text.split(" ");
        text = word_tokenize(text)
        if self.display:
            print "After Tokenizing"
            print text
            print "\n\n"

        text=[w.strip().lower() for w in text if not w.strip() in ENGLISH_STOPWORDS and len(w.strip())>2]
        
        tc = TextCollection([text])
        words = list(set(tc))
        
        word_tf = {word: tc.tf(word, text) * len(text) for word in words}

        return word_tf
Example #5
def attrexplore(corpus):
    # s = "in douglas r. stinson, editor,.proc. crypto 93,.lecture notes in computer science no. 773..pages 278-291..1994..avrim blum, merrick furst, michael kearns, and richard j. lipton..springer,.cryptographic primitives based on hard learning problems.."
    # ss = SenToken(raw=s)
    # print(ss)
    # for sent in ss:
    #     print(sent)

    nltkCorpus = TextCollection(corpus)
    print(nltkCorpus.idf(term='this'))

    print(idf(term='this', corpus=corpus))

    print(nltkCorpus.tf(term='this', text='this is sentence four'))
    print(tf_idf(term='this', doc='this is sentence four', corpus=corpus))
    fdist = nltk.FreqDist(WordTokener(sent=corpus[0]))
    print(fdist.tabulate())
Example #6
    def preprocess(self, text):
        #text = text.split(" ");
        text = word_tokenize(text)
        if self.display:
            print "After Tokenizing"
            print text
            print "\n\n"

        text = [
            w.strip().lower() for w in text
            if not w.strip() in ENGLISH_STOPWORDS and len(w.strip()) > 2
        ]

        tc = TextCollection([text])
        words = list(set(tc))

        word_tf = {word: tc.tf(word, text) * len(text) for word in words}

        return word_tf
Example #7
def compute_tf_idf_similarity(query: str, content: str, type: str) -> float:
    """
    Compute the mean tf-idf or tf
     similarity for one sentence with multi query words.
    :param query: a string contain all key word split by one space
    :param content: string list with every content relevent to this query.
    :return: average tf-idf or tf similarity.
    """
    sents = [word_tokenize(content),
             word_tokenize("")]  # add one empty file to smooth.
    corpus = TextCollection(sents)  # 构建语料库

    result_list = []
    for key_word in query.strip(" ").split(" "):
        if type == "tf_idf":
            result_list.append(corpus.tf_idf(key_word, corpus))
        elif type == "tf":
            result_list.append(corpus.tf(key_word, corpus))
        else:
            raise KeyError

    return sum(result_list) / len(result_list)
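For reference, an invented call (word_tokenize again needs nltk's punkt data); the empty document appended inside the function is what keeps the idf of content words above zero:

score = compute_tf_idf_similarity('neural networks',
                                  'neural networks learn representations from data',
                                  'tf_idf')
print(score)  # mean tf-idf of 'neural' and 'networks' over the two-document corpus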
Example #8
def splitter(fileName):

    with open(fileName,'r') as f:
        d = json.load(f)
    #assuming the file is evenly divisible by 10
    for i in range(3):
        shuffle(d)
    d = d[:1000]
    corpus = []
    corpusList = []
    classifiers = []

    for i in range(len(d)):
        d[i]['tAbstract'] = tknize(d[i]['abstract'])
        corpus.extend(d[i]['tAbstract'])
        corpusList.extend(d[i]['tAbstract'])

        if d[i]['type'] not in classifiers:
            classifiers.append(d[i]['type'])

    # initialize numpy array
    for i in range(len(d)):
        d[i]['vector'] = numpy.empty([len(corpusList)])

    tc = TC(corpus)
    print("Starting vector calculation")
    for doc in d:
        place = 0
        for word in corpusList:
            idf = tc.idf(word)
            tf = tc.tf(word, doc['tAbstract'])

            # create a vector that is guaranteed to be in the same order for each doc, as
            # each doc appends the tf-idf score of the word to its vector at the same time
            doc['vector'][place] = idf * tf
            place += 1

    return d, classifiers
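This function leans on project helpers defined elsewhere (tknize, TC as an alias of TextCollection, json, numpy, random.shuffle); a hypothetical input file only needs the 'abstract' and 'type' keys that it reads:

import json

papers = [  # invented fixture with the keys splitter() reads
    {'abstract': 'A study of tf-idf weighting for document clustering.', 'type': 'IR'},
    {'abstract': 'Convolutional networks for image classification.', 'type': 'CV'},
]
with open('papers.json', 'w') as f:
    json.dump(papers, f)

docs, classifiers = splitter('papers.json')
print(classifiers)  # ['IR', 'CV'] (order follows the shuffled documents)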
Example #9
# tokens =nltk.word_tokenize(f2)
corpus = TextCollection(chong)  # build the corpus
#print("_______________@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
# print(corpus) #输出语料库

sents_1 = sent_tokenize(f1)
chong_1 = [word_tokenize(sent) for sent in sents_1]
corpus_1 = TextCollection(chong_1)
# print("23333333333333333333333333333333333333333333333########################")
# print(corpus_1)
# print("_________________________")

# compute the tf value of each word in the corpus
for a in filtered_words:

    tf = corpus_1.tf(a, corpus_1)  # 1/12
    #print(a,"tf:",tf)
    #print(type(a))

    # compute the idf value of each word in the corpus

    idf = corpus.idf(a)  #log(3/1)
    #print(a,"idf:",idf)

    tf_idf = tf * idf

    d = dict.fromkeys([a], tf_idf)
    #print(d)
    Word_dict.update(d)

#print("_________________________")
Example #10
    print(len(features))

from nltk.text import TextCollection
from nltk.tokenize import word_tokenize

# First, build the corpus
sents = [
    'this is sentence one', 'this is sentence two', 'this is sentence three'
]
sents = [word_tokenize(sent) for sent in sents]  # tokenize each sentence
print(sents)  # print the tokenized sentences
corpus = TextCollection(sents)  # build the corpus
print(corpus)  # print the corpus

# compute the tf value of "one" in the corpus
tf = corpus.tf('one', corpus)  # 1/12
print(tf)

# compute the idf value of "one" in the corpus
idf = corpus.idf('one')  # log(3/1)
print(idf)

# compute the tf-idf value of "one" in the corpus
tf_idf = corpus.tf_idf('one', corpus)
print(tf_idf)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

x_train = [
    'TF-IDF 主要 思想 是', '算法 一个 重要 特点 可以 脱离 语料库 背景',
Example #11
    def processSimilarity(self):
        # Similarity analysis
        # Find all synonyms of all words
        wordsGroup = []
        if not os.path.exists('featureGroups.txt'):
            marker = 0
            matches = []
            group = []
            for w in self.CandidateTerms:
                if w != "" and not (type(w) is list):
                    for word in w.split():
                        for synset in wn.synsets(word):
                            for synonym in synset.lemmas():
                                matches = [
                                    term for term in self.CandidateTerms
                                    if not (type(term) is list) and (
                                        synonym.name() in term.split()
                                        and term not in matches)
                                ]
                                matches = self.RemoveDuplicates(matches)
                                if len(matches) > 0:
                                    # Construct words group
                                    group.extend(matches)
                        if len(group) > 0:
                            wordsGroup.append(
                                ('FG' + str(marker), copy.deepcopy(group)))

                            for val in group:
                                with open("featureGroups.txt", "a") as fg:
                                    fg.write('FG' + str(marker) + ' -> ' +
                                             val + '\n')

                            group = []
                            marker = marker + 1
        else:
            added = False
            with open("featureGroups.txt", "r") as fg:
                grouptext = fg.read()
            lines = grouptext.split("\n")
            for line in lines:
                arr = line.split("->")
                for (m, v) in wordsGroup:
                    if m == arr[0]:
                        v.append(arr[1])
                        added = True
                        break
                if not added:
                    if len(arr) > 1:
                        wordsGroup.append((arr[0], [arr[1]]))

                added = False

        textcombine = ' '
        for i, s, t in self.tokenized_sentence_array:
            textcombine = (textcombine + ''.join(s))
        corpuscol = TextCollection([textcombine])
        for g in wordsGroup:
            for w in g:
                cnt = 0
                weight = 0.0
                for t in w:
                    weight = weight + corpuscol.tf(t, textcombine)
                    cnt = cnt + 1
                self.WeightedCandidateTerm.append((w, weight / cnt))
                with open("weightedGroups.txt", "a") as wg:
                    wg.write(str(self.WeightedCandidateTerm).strip('[]'))
        return wordsGroup
Example #12
class CitationSearch:
    def __init__(self, pairs, mode='eng', stopwords_flag=True):
        self.pair_dict = {}
        self.ids = [pair[0] for pair in pairs]
        self.tfidfs = []
        self.mode = mode
        self.stopwords_flag = stopwords_flag
        docs = [pair[1] for pair in pairs]
        self.docs = [self.preprocess(doc) for doc in docs]
        for id, text in zip(self.ids, self.docs):
            self.pair_dict[id] = text
        self.corpus = TextCollection(self.docs)
        self.query = []

    def preprocess(self, raw):
        if self.mode == 'eng':
            return self.preprocess_eng(raw)
        if self.mode == 'ru':
            return self.preprocess_ru(raw)
        if self.mode == 'eng+ru':
            return self.preprocess_eng(raw) + self.preprocess_ru(raw)

    def preprocess_eng(self, raw):
        doc = []
        text = re.findall(words_eng, raw)
        for token in text:
            token = token.lower()
            if self.stopwords_flag:
                if token not in stopwords_eng:
                    doc.append(stemmer.stem(token))
            else:
                doc.append(stemmer.stem(token))
        return doc

    def preprocess_ru(self, raw):
        doc = []
        text = re.findall(words_ru, raw)
        for token in text:
            if self.stopwords_flag:
                if token not in stopwords_ru:
                    doc.append(morph.parse(token)[0].normal_form)
            else:
                doc.append(morph.parse(token)[0].normal_form)
        return doc

    def get_tf(self, term, document):
        return self.corpus.tf(term, document)

    def get_idf(self, term):
        return self.corpus.idf(term)

    @staticmethod
    def normalize_cosine(doc, doc_vecs):
        counter = Counter(doc)
        cosine_norm = np.sqrt(np.sum(
            np.array(list(dict(counter).values()))**2))
        doc_vector = np.array(doc_vecs) / cosine_norm
        return doc_vector

    def tfidf_docs(self):
        doc_vectors = []
        for doc in self.docs:
            doc_tfs = {}
            for term in doc:
                doc_tfs[term] = self.get_tf(term, doc) * self.get_idf(term)
            doc_vector = self.normalize_cosine(doc, list(doc_tfs.values()))
            doc_tfidfs = {}
            for term, vec in zip(doc_tfs, doc_vector):
                doc_tfidfs[term] = vec
            doc_vectors.append(doc_tfidfs)
        self.tfidfs = doc_vectors

    def tfidf_queries(self, query):
        self.query = self.preprocess(query)
        query_tfsidfs = {}
        for term in self.query:
            query_tfsidfs[term] = self.get_tf(term,
                                              self.query) * self.get_idf(term)
        return query_tfsidfs

    def query_relevance(self, query):
        tfidf = self.tfidf_queries(query)
        query_vec = list(tfidf.values())
        doc_vecs = []
        for doc in self.tfidfs:
            doc_vec = []
            for term_query in tfidf:
                if term_query in doc:
                    doc_vec.append(doc[term_query])
                else:
                    doc_vec.append(0)
            doc_vecs.append(doc_vec)
        cosines = []
        for vec in doc_vecs:
            if np.any(vec):
                cosines.append(1 - cosine(vec, query_vec))
            else:
                cosines.append(0)
        relevance_ids = [
            text_id for _, text_id in sorted(
                zip(cosines, self.ids), key=(lambda x: x[0]), reverse=True)
        ]
        cosines.sort(reverse=True)
        most_relevant = relevance_ids[0]
        relevant_candidates = [relevance_ids[0]]
        for cos in range(1, len(cosines)):
            if cosines[0] - cosines[cos] <= 0.000001:
                relevant_candidates.append(relevance_ids[cos])
        if len(relevant_candidates) > 1:
            tiebreaker = []
            for id in relevant_candidates:
                rel_text = self.pair_dict[id]
                absent_words = 0
                for word in rel_text:
                    if word not in self.query:
                        absent_words += 1
                tiebreaker.append(absent_words)
            relevant_candidates = [
                text_id
                for _, text_id in sorted(zip(tiebreaker, relevant_candidates),
                                         key=(lambda x: x[0]))
            ]
            most_relevant = relevant_candidates[0]
        return most_relevant, cosines[0]
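A minimal driving sketch for this class, assuming the module-level pieces it relies on (the words_eng/words_ru regexes, stop-word lists, stemmer, morph, numpy and scipy's cosine) are defined as in the original project; the (id, text) pairs here are invented:

pairs = [(1, 'Neural models for citation matching in scholarly articles.'),
         (2, 'A survey of classical information retrieval ranking functions.')]
searcher = CitationSearch(pairs, mode='eng')
searcher.tfidf_docs()  # pre-compute cosine-normalized tf-idf vectors for the documents
best_id, score = searcher.query_relevance('citation matching with neural models')
print(best_id, score)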
Example #13
def write_tfidf_file(xmlcollection, nltk_textcollection):
    """
    Writes a tf*idf matrix file with all tf*idf values for each 
    document, row by row. The columns represent the (alphabetically
    ordered) stems available in the whole collection.
    @param xmlcollection: Collection of XML documents, type collection
    @param nltk_textcollection: NLTK TextCollection of all the stems
    """
    idf_file = get_stems_file(measure="_idf")
    avg_words_per_doc = len(xmlcollection.get_words()) / \
                        len(xmlcollection.get_docs())

    if not exists(idf_file):
        write_idf_file(xmlcollection, nltk_textcollection)

    idf_dict = DictFromFile(idf_file)
    tfidf_dict = dict()
    high_tfidf_stems = set()
    
    collection_stems = list(xmlcollection.get_stems(uniq=True))
    print "Length of collection, all stems:", len(collection_stems)
    
    # Remove most frequent (idf<2) / stop stems (or qualifying 
    # as such), and most rare stems (max(idf)), as they are of no 
    # help to separate / make up clusters
    collection_stems = get_classification_stems(collection_stems, idf_dict)
    print "Length of collection, cluster stems:", len(collection_stems)
    
    f = open(get_tfidf_matrix_file(), "w", get_def_enc())
    for doc in xmlcollection.get_docs():
        doc_stems = doc.get_stems()
        col = TextCollection("")
        
        stdout.write(doc.get_id())
        idf_row = ""
        stdout.write(" (")
        for stem in sorted(collection_stems):
            tf = col.tf(stem, doc_stems)
    
            # Reweight tf values, to get more classification words
            # and compensate for the very different document sizes 
            # available
            # Idea: Accounts for average document length, but also for
            # the number of times a word effectively occurs in a 
            # specific document; other variations can be thought of 
            # (using log) or maximal tf values
            # Note: The clustering works better with (in general)
            # smaller values
            if tf > 0.0:
                tf = 1.0 / avg_words_per_doc * tf
            # If nothing applies: tf is 0.0
                
            tfidf = tf*float(idf_dict[stem])
            tfidf_dict[stem] = tfidf

            # We may find here some threshold that makes sense
            if (tfidf > 0.0):
                stdout.write(stem + ", ")
                high_tfidf_stems.add(stem)
            
            idf_row += str(tfidf) + " "
        f.write(idf_row + "\n")
        stdout.write(")\n")
    f.close()
    print "List length of high value tf*idf terms:", len(high_tfidf_stems)
    
    sorted_tfidf_dict = \
        sorted(tfidf_dict.iteritems(), reverse=True,
               key=operator.itemgetter(1))
    
    f = open(get_stems_file(measure="_tfidf_sorted"), "w", get_def_enc())
    for pair in sorted_tfidf_dict: 
        f.write(str(pair[1]) + " " + pair[0] + "\n")
    f.close()
Example #14
        ]
        s = nltk.stem.SnowballStemmer('english')
        token = [s.stem(ws) for ws in token]
        tokens.append(token)
    f = open('../Data/hair_dryer/tokens_dryer.pkl', 'wb')
    pickle.dump(tokens, f)
    f.close()

    # build the corpus
    corpus = TextCollection(tokens)

    tf = {}
    tf_idf = {}
    for review in tokens:
        for word in review:
            if word not in tf:
                tf_ = corpus.tf(word, corpus)
                tf[word] = tf_
            if word not in tf_idf:
                tf_idf_ = corpus.tf_idf(word, corpus)
                tf_idf[word] = tf_idf_

    tf_sorted = sorted(tf.items(), key=lambda item: item[1], reverse=True)
    tf_idf_sorted = sorted(tf_idf.items(),
                           key=lambda item: item[1],
                           reverse=True)

    pd.DataFrame(tf_sorted).to_csv('../Data/hair_dryer/tf_sorted_dryer.csv')
    pd.DataFrame(tf_idf_sorted).to_csv(
        '../Data/hair_dryer/tf_idf_sorted_dryer.csv')
Example #15
    str(editdistance) + "; this may take a while!"
xmlcollection.get_words_by_editdistance(editdistance=editdistance,
                                        no_of_most_freq=no_of_topwords)

# Write the found sets to disk; also write most frequent words to disk.
xmlcollection.write_words_by_editdistance(editdistance=editdistance)
xmlcollection.write_topwords(no_of_words=no_of_topwords)
print "Top words written to disk."

# XXX: BIG F**K UP ################################## FIX FIX FIX #####

# Print idf, tf and tf-idf values for the term "CCC", in document
# no. 42 - for testing.
nltk_textcollection = TextCollection(xmlcollection.get_words())
print "idf: " + str(nltk_textcollection.idf("CCC"))
print "tf: " + str(nltk_textcollection.tf("CCC", 
    TextCollection(xmlcollection.get_doc(42).get_tokens())))
print "tf_idf: " + str(nltk_textcollection.tf_idf("CCC", 
    TextCollection(xmlcollection.get_doc(42).get_tokens())))

# Do that now systematically for all documents
print "Document where tf is bigger 0:"
cnt = 0
for doc in xmlcollection.get_docs():
    tf = nltk_textcollection.tf("CCC", TextCollection(doc.get_tokens()))
    stdout.write(str(tf) + ", ")
    cnt += 1
    if cnt == 10: 
        print
    cnt = 0
    if tf > 0.0: 
        print "\n" + doc.get_xml_filename()
Example #16
from __future__ import print_function
from nltk.corpus import PlaintextCorpusReader
from nltk.text import TextCollection

#load all the files in the corpus root,
#and calculate tf, idf, and tf_idf on them, and on a specific term

if __name__ == "__main__":
    corpus_root = '../data/source_data'
    corpus = PlaintextCorpusReader(corpus_root,'[a-zA-Z \-]*\.txt')

    ids = corpus.fileids()

    collection = TextCollection(corpus)

    #for x,word in enumerate(corpus.words(ids[0])[:200]):
    #    print(x,word)

    source = ids[0]
    term = corpus.words(source)[107]
    doc = corpus.words(ids[2])



    print("Source: ",source)
    print("TF of: ",term,": ",collection.tf(term,doc))
    print("IDF of: ",term,": ",collection.idf(term))
    print("tf_Idf of:",term,": ",collection.tf_idf(term,doc))

Example #17
    novel_data = open(file).read()
    cleaned_data = ''.join(re.findall(r'[\u4e00-\u9fa5]', novel_data))

    wordlist = jieba.lcut(cleaned_data)
    return wordlist


# different kinds of text
text1 = text(file='./text/new2.txt')
#text2 = text(file='./texts/caijing.txt')
#text3 = text(file = './texts/xinwen.txt')
#text4 = text(file = './texts/keji.txt')

# initialize the list of texts as a TextCollection
mytexts = TextCollection([text1])
dict_key = {}
# iterate over the word list, match against computer-related nouns, and pick the top few words

wordlist = text_wordlsit(sys.argv[1])

for wod in wordlist:
    if len(wod) < 3:
        continue
    cfd = mytexts.tf(wod, text1)
    dict_key[wod] = cfd

listdic = sorted(dict_key.items(), key=lambda d: d[1], reverse=True)
print(listdic[:5])

# missing a proper lexicon; the word segmentation also does not look very good.