Example #1
def cosine_string(a, b, char=True, move=False):
    if move == False:
        t = w.tf_idf(a, b, char=char)
        return round(simi.cosine_similarity(t.A[0], t.A[1]), 15)
    else:
        t = w.tf_idf(a, b, char=char)
        cosine = round(simi.cosine_similarity(t.A[0], t.A[1]), 15)
        p = perpindahan(a, b)
        pindah = round(simi.cosine_similarity(p[0], p[1]), 15)
        return (cosine + pindah) / 2
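In the snippet above, w.tf_idf and simi.cosine_similarity come from helper modules that are not shown. A minimal sketch of what the tf-idf step could look like, assuming scikit-learn is acceptable (tf_idf_pair is a hypothetical name, not the project's w module):

from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf_pair(a, b, char=True):
    # character-level or word-level analyzer, mirroring the char flag above
    vectorizer = TfidfVectorizer(analyzer='char' if char else 'word')
    # fit on the two strings and return a dense 2 x vocabulary matrix,
    # so row 0 and row 1 can be fed to a cosine-similarity function
    return vectorizer.fit_transform([a, b]).toarray()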
def calculate_corpus_tf_idf(corpus_reader):
	'''
	Calculate TF*IDF weight for each document in given corpus.
	-Input: CorpusReader (either Tagged or PlainText )
	-Return:
	 (1) A dictionary whose keys=document name and values=dictionary
	 with terms for keys and TF*IDF weights for values
	 (2) A dictionary whose keys=terms and values=their IDF
	'''
	st_time = time.time()
		
	# Term Frequency for each category
	tfs_per_document = defaultdict(Counter)
	for document in corpus_reader.fileids():
		terms_in_document = corpus_reader.words(document)
		tfs_per_document[document] = tf_idf.tf(terms_in_document)
		
	# Inverse Document Frequency
	idfs = tf_idf.idf(tfs_per_document)
	
	# key is folder name, value is a list of (term, tfidf score) pairs
	tfidfs_per_document = defaultdict(defaultdict) 
	for document, tfs in tfs_per_document.iteritems():
	    tfidfs_per_document[document] = tf_idf.tf_idf(tfs, idfs, len(tfs_per_document))
		
	print "time to compute TF-IDF weights for corpus: %.3f sec" % (time.time()-st_time)
	return tfidfs_per_document, idfs
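calculate_corpus_tf_idf depends on a tf_idf module that is not shown here. A rough sketch of tf, idf and tf_idf helpers that match how they are called above (per-document term counts, corpus-level IDF, then their product); this is an assumption about the interface, not the project's actual implementation:

import math
from collections import Counter

def tf(terms):
    # raw term frequency for a single document
    return Counter(terms)

def idf(tfs_per_document):
    # idf(term) = log(number of documents / number of documents containing the term)
    n_docs = len(tfs_per_document)
    doc_freq = Counter()
    for doc_tfs in tfs_per_document.values():
        doc_freq.update(doc_tfs.keys())
    return {term: math.log(float(n_docs) / df) for term, df in doc_freq.items()}

def tf_idf(tfs, idfs, n_docs):
    # weight each term of one document by its corpus IDF;
    # unseen terms fall back to the largest possible IDF
    return {term: freq * idfs.get(term, math.log(n_docs)) for term, freq in tfs.items()}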
Example #3
def execute():
    table = tf_idf()
    text_output.delete('1.0', tk.END)
    folder_name = folder
    num_of_files = len([
        name for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    ]) + 1
    for x in range(1, num_of_files):
        file_name = folder_name + '/' + str(x).zfill(2) + '.txt'
        table.add_file(file_name)
    top_k = entry_top_k.get()
    top_k = int(top_k)
    for x in range(1, num_of_files):
        target_file = folder_name + '/' + str(x).zfill(2) + '.txt'
        var = 'Top ' + str(top_k) + ' of tf-idf in ' + os.path.basename(
            target_file) + ' : \n'
        text_output.insert('end', var)
        var = table.get_tf_idf(target_file, top_k)
        text_output.insert('end', var)
        var = '\n\n'
        text_output.insert('end', var)
    keyword = entry_keyword.get()
    var = 'tf-idf of key word "' + keyword + '" : \n'
    text_output.insert('end', var)
    var = table.similarities([keyword])
    for x in var:
        x[0] = os.path.basename(x[0])
    text_output.insert('end', var)
    return 0
def classify_article_words(article_words, corpus_tfidfs_per_category, corpus_idf):
	'''
	Find the category that best matches the given article among the given
	categories.
	-Input: (1) list of article terms (2) TF*IDF weights for each document in
	 corpus (3) IDF for the entire corpus
	-Return: top category and a dictionary with match score for the article
	 with each category
	'''
	st_time = time.time()
	
	# Find article TF and TFIDF
	article_tfs = tf_idf.tf(article_words)
	article_tfidfs = tf_idf.tf_idf(article_tfs, corpus_idf, len(corpus_tfidfs_per_category))
	
	# find best match among categories
	sim_scores = defaultdict()
	for cat_name, cat_tfidf_scores in corpus_tfidfs_per_category.iteritems():
		cos_sim_time = time.time()
		sim_scores[cat_name] = \
			cosine_sim.cosine_similarity_dict(article_tfidfs, cat_tfidf_scores)
		
	# sort by value (match score), descending
	match = sorted(sim_scores.iteritems(), key=operator.itemgetter(1), reverse=True)[0][0]
	
	return match, sim_scores
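cosine_sim.cosine_similarity_dict is also external; a possible implementation over sparse {term: weight} dictionaries, consistent with the call above (an assumed sketch, not the project's code):

import math

def cosine_similarity_dict(vec_a, vec_b):
    # dot product over the terms the two sparse vectors share
    dot = sum(weight * vec_b[term] for term, weight in vec_a.items() if term in vec_b)
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)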
def data_to_dataset(datas,
                    code_and_dates=None,
                    labels=None,
                    comments=None,
                    return_format="APPLY_FOR_NLTK_CLASSIFY"):
    """
    datas별로 dates와 codes를 제공 하거나
    code_and_dates 는 list속의 (code, date) 형식
    label을 내놓아야 한다.
    
    APPLY_FOR_NLTK_CLASSIFY의 경우
    참고로, 너무 최근이어서 다음날의 주가를 알수없는경우 featureset 만으로 2번 반환리스트에 속한다.
    그 이외에서는 (featureset, label) 로 1번 반환리스트에 속한다.
    """
    assert ((code_and_dates is not None) | (labels is not None))
    if not labels:
        assert (len(datas) == len(code_and_dates))
        labels = [up_or_down(*I) for I in code_and_dates]

    words_and_importent_in_docs = tf_idf.tf_idf(datas)

    def build_feature():
        return [
            sorted(I.items(), key=lambda X: (X[1], X[0]))[::-1][:5]
            for I in words_and_importent_in_docs.values()
        ]

    features = build_feature()
    if comments is None:
        comments = [()] * len(features)
    if return_format == "APPLY_FOR_NLTK_CLASSIFY":
        return [({W: V for W, V in F}, L, *CC) for F, L, CC in zip(features, labels, comments) if L is not None], \
               [({W: V for W, V in F}, None, *CC) for F, L, CC in zip(features, labels, comments) if L is None]
    else:
        return features, labels
def data_to_dataset(datas, code_and_dates = None, labels = None, return_format = "APPLY_FOR_NLTK_CLASSIFY"):
    """
    datas별로 dates와 codes를 제공 하거나
    code_and_dates 는 list속의 (code, date) 형식
    label을 내놓아야 한다.
    
    APPLY_FOR_NLTK_CLASSIFY의 경우
    참고로, 너무 최근이어서 다음날의 주가를 알수없는경우 featureset 만으로 2번 반환리스트에 속한다.
    그 이외에서는 (featureset, label) 로 1번 반환리스트에 속한다.
    """
    assert( (code_and_dates is not None) | (labels is not None ))
    if not labels:
        assert(len(datas) == len(code_and_dates))
        labels = [ up_or_down(*I) for I in code_and_dates]
    
    words_and_importent_in_docs = tf_idf.tf_idf(datas)
    def build_feature():
        return [sorted(I.items(), key = lambda X : (X[1], X[0]))[::-1][:5] for I in words_and_importent_in_docs.values()]
    features = build_feature()
    if return_format == "APPLY_FOR_NLTK_CLASSIFY":
        # Return the full feature weights (can be very inefficient).
        return [({W: V for W, V in F}, L) for F, L in zip(features, labels) if L is not None], \
               [{W: V for W, V in F} for F, L in zip(features, labels) if L is None]
        # Alternative (unreachable by default): return presence-only features.
        # return [({W: 1 for W, V in F}, L) for F, L in zip(features, labels) if L is not None], \
        #        [{W: 1 for W, V in F} for F, L in zip(features, labels) if L is None]
    else:
        return features, labels
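The APPLY_FOR_NLTK_CLASSIFY format is shaped for NLTK's classifiers. A hedged usage sketch (the documents and labels below are made up, and the tf_idf module is assumed to be importable as in the snippet above):

import nltk

# hypothetical tokenised documents and their next-day price labels
docs = ["stock price rises after strong earnings", "shares fall on weak guidance"]
labeled, unlabeled = data_to_dataset(docs, labels=["up", "down"])

# each element of `labeled` is a (featureset, label) pair, which is the shape
# nltk.NaiveBayesClassifier.train expects; `unlabeled` holds featuresets only
classifier = nltk.NaiveBayesClassifier.train(labeled)
predictions = [classifier.classify(features) for features in unlabeled]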
def train(outputfile, corpus):
    print("Computing most signicant ngrams for", corpus)
    tfidf = tf_idf(corpus)
    top_200 = top_n(tfidf, 200)
    print("Dumping training output...")
    with open(outputfile, 'wb') as file:
        pickle.dump(top_200, file)
    print("Output saved at", outputfile)
def get_website_scores(question):
    from tf_idf import tf_idf
    from conf.website_conf import website_list
    from website_corpus import get_corpus_list
    # fetch the corpus for each website
    corpus_list = get_corpus_list(website_list)
    words, word_weights = tf_idf(corpus_list)
    for i in range(len(word_weights)):
        for j in range(len(words)):
            print words[j], word_weights[i][j]
def run_tf_idf():
    if not os.path.exists('tf_idf_top2000.xlsx'):
        print('start tf-idf method ...')
        print(type(corpus[0]))
        start_time = time.time()
        keywords_tf_idf = tf_idf(corpus)
        print(keywords_tf_idf)
        print('total time taken:', time.time() - start_time, 's')

    res_path = 'tf_idf_top2000.xlsx'

    return res_path
Example #10
def getConsultapadrao():
    M = [
        'O peã e o caval são pec de xadrez. O caval é o melhor do jog.',
        'A jog envolv a torr, o peã e o rei.', 'O peã lac o boi',
        'Caval de rodei!', 'Polic o jog no xadrez.'
    ]
    stopwords = ['a', 'o', 'e', 'é', 'de', 'do', 'no', 'são']
    q = 'xadrez peã caval torr'
    separadores = [' ', ',', '.', '!', '?']
    (ponderada_docs,
     ponderada_consulta) = tf_idf.tf_idf(M, stopwords, q, separadores)
    return modelo_vetorial(ponderada_docs, ponderada_consulta)
    def selector(self, method):
        """
        Selector for the chosen method.
        """
        # Initialise the chosen method
        # TF-IDF
        if self.method == "tf-idf":
            self.tf_idf = tf_idf(self.data)

        # Word to vec
        elif self.method == "word2vec":
            self.word2vec = word2vec(self.data)

        elif self.method == "doc2vec":
            self.doc2vec = doc2vec(self.data)
def get_top_n_website_scores(question, n=5):
    from tf_idf import tf_idf, get_corpus_list
    from conf.website_conf import website_list
    from jieba_split import split_word_only
    from cosine import batch_get_sort_scores
    from wikipedia_expansion import get_question_expansion_corpus
    if debug_flag:
        print 'question:', question

    website_corpus = get_corpus_list(website_list)
    question_corpus = get_question_expansion_corpus(question)
    # question_corpus = split_word_only(question)
    website_corpus.append(question_corpus)
    words, words_weight = tf_idf(website_corpus)

    return batch_get_sort_scores(words_weight, website_list)[0:n]
Example #13
def test():
    docs = [
        '我 今天 心情 很好,但是 看到 不想 看到 的 人 了', '我 今天 心情 很好,且 看到 想 看到 的 人 了',
        '我 今天 心情 很 不好,但是 看到 想 看到 的 人 了', '我 今天 心情 很 不好,且 看到 不想 看到 的 人 了',
        '我 今天 心情 很好,且 看到 想 看到 的 人 了'
    ]
    from tf_idf import tf_idf
    words, words_weight = tf_idf(docs)
    for i, x in enumerate(words_weight):
        print docs[i]
        for j, y in enumerate(words_weight[i]):
            print words[j], words_weight[i][j]
        print

    for i in range(0, 4):
        score = get_cossimi(words_weight[i], words_weight[4])
        print 'score', i, ':', score
        print
Example #14
def common_emoji_cl(fname,m,names=False):
	#high tf-idf emoji of each cluster
	print 'Extract terms from %s'%fname
	[emoji,emoji_cl]=extract_data(fname)
	N=len(emoji)
	k=len(emoji_cl)
	N_cl=[len(emoji_cl[i]) for i in xrange(1,k+1)]

	print 'Count emoji tf'
	counter=Counter()
	counter=tf_idf.count(emoji,counter,ref=emoji_dict,type='emoji')
	counter_cl=[tf_idf.tf(counter,emoji_cl[i+1],ref=emoji_dict,type='emoji') for i in xrange(k)]

	print 'Calculate cluster emoji tf-idf'	
	tfIdf=[tf_idf.tf_idf(counter,counter_cl[i],N,N_cl[i]) for i in xrange(k)]
	term=tfIdf[0][0]
	tfIdf=[tfIdf[i][1] for i in xrange(k)]

	print 'Write results'
	write_common_emoji_cl(fname,tfIdf,names)
def lang_detect(inputfolder, corpus):
    for dirpath, dirnames, filenames in os.walk(inputfolder):
        for inputfile in corpus:
            lang_scores = defaultdict(int)
            ngrams_input = tf_idf([inputfile])
            for filename in filenames:
                path = dirpath + "/" + filename
                # Open languages profiles for comparison:
                with open(path, 'rb') as file:
                    ngrams = pickle.load(file)
                    for ngram, tfidf in ngrams:
                        if ngram in ngrams_input:
                            lang_scores[path] += tfidf * ngrams_input[ngram]
            # Language detection:
            best = (None, 0)
            for lang, score in lang_scores.items():
                print(f'\t{lang}: {score:.10f}')
                if score > best[1]:
                    best = (lang, score)
            print(f'Detected {inputfile} to be from this language: {best[0]}\n')
        break  # Only toplevel dir
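train and lang_detect together form a small language-identification pipeline: build one pickled tf-idf n-gram profile per language, then score unknown texts against every profile. A usage sketch with hypothetical paths and corpora:

# hypothetical corpora: one list of training texts per language
train('profiles/english.pickle', english_corpus)
train('profiles/german.pickle', german_corpus)

# score each text in unknown_texts against every profile under profiles/
lang_detect('profiles', unknown_texts)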
Example #16
def merge(filepath=""):  #path of chunk file
    #merges the feature vectors together so that they can be used in Naive bayes
    #return a dictionary ( sentenceid: featureVector)
    tree = ET.parse(filepath)
    root = tree.getroot()
    coherence_dict = centroid_coherence(root)  #sentid: coherence
    tf_isf_dict = tf_isf(root, 1)  #sentid: total tf_isfscore
    tf_idf_dict = tf_idf(root)
    length_dict, position_dict = sentence_length_position(root)
    title_simm_dict = title_simm_main1(filepath)
    is_question_dict = is_question(root)
    #print title_simm_dict
    ans = {}
    for sentid in coherence_dict:
        ans[sentid] = {
            "coherence": coherence_dict[sentid],
            "tfIsf": tf_isf_dict.get(sentid, 0),
            "tfIdf": tf_idf_dict.get(sentid, 0),
            "length": length_dict.get(sentid, 0),
            "position": position_dict.get(sentid, 0),
            "titleSimm": title_simm_dict.get(sentid, 0),
            "isQues": is_question_dict.get(sentid, 0),
        }

    return ans
def run_intersection():
    print('start intersection ...')
    start_time = time.time()
    if not os.path.exists('tf_idf_top2000.xlsx'):
        # tf-idf
        print('start tf-idf method ...')
        start_time = time.time()
        keywords_tf_idf = tf_idf(corpus)
        print(keywords_tf_idf)
        print('total time taken:', time.time() - start_time, 's')

    if not os.path.exists('chi_square_top2000.xlsx'):
        # chi_square
        print('start chi_square method ...')
        start_time = time.time()
        keywords_chi_square = chi_square(DATA_PATH, DICT_PATH, corpus)
        print(keywords_chi_square)
        print('total time taken:', time.time() - start_time, 's')

    if not os.path.exists('word2vec_huffman_top2000_multiprocessing.xlsx'):
        # word2vec extra data processing
        categories = len(corpus)
        for i in range(categories):
            corpus[i] = corpus[i].split()

        # word2vec_huffman_softmax
        print('start huffman method ...')
        start_time = time.time()
        keywords_huffman = word2vec_huffman(corpus, MODEL_HUFFMAN_PATH)
        print(keywords_huffman)
        print('total time taken:', time.time() - start_time, 's')

    tf_idf_res = pd.read_excel('tf_idf_top2000.xlsx', sheet_name=None)
    word2vec_res = pd.read_excel(
        'word2vec_huffman_top2000_multiprocessing.xlsx', sheet_name=None)
    chi_square_res = pd.read_excel('chi_square_top2000.xlsx', sheet_name=None)
    sheet_names = list(tf_idf_res.keys())
    sheet_num = len(sheet_names)
    writer = pd.ExcelWriter('intersection.xlsx')
    for i in range(sheet_num):
        new_sheet = []
        this_tf_idf = tf_idf_res[sheet_names[i]]
        this_word2vec = word2vec_res[sheet_names[i]]
        this_chi_square = chi_square_res[sheet_names[i]]
        for word in this_word2vec.values.tolist():
            word = word[0]
            if word not in new_sheet:
                new_sheet.append(word)
        for word in this_chi_square.values.tolist():
            word = word[0]
            if word not in new_sheet:
                new_sheet.append(word)
        for word in this_tf_idf.values.tolist():
            word = word[0]
            if word not in new_sheet:
                new_sheet.append(word)
        intersection = pd.DataFrame({'word': new_sheet})
        intersection.to_excel(writer, sheet_name=sheet_names[i], index=None)
    writer.save()
    writer.close()

    print('total time taken:', time.time() - start_time, 's')

    res_path = 'intersection.xlsx'

    return res_path
Example #18
def tf_idf_cl(term_cl,user,counter,top_user,N,term_remove):
	''''''
	names=list(set(user))
	m=len(user)
	count=[len([user[i] for i in xrange(m) if user[i]==name]) for name in names]
	idx=np.argsort(count)[::-1]
	if top_user>len(idx):
		top_user=len(idx)
		print '---- cluster has less than %d unique users'%top_user
	names=[names[i] for i in idx[:top_user]]
	term_user=init_clusters(len(names),names)	
	for name in names:
		term_user[name]=[term_cl[i] for i in xrange(m) if user[i]==name]
	''''''
	print '-- Count cls-term tf'
	counter_i=copy.copy(counter)
	counter_i.subtract(counter)
	counter_term=tf_idf.tf(counter_i,term_cl,'term')

	''''''
	print '-- Count user-term tf'
	counter_user=init_clusters(len(names),names)
	for name in names:
		counter_i=copy.copy(counter_term)
		counter_i.subtract(counter_term)
		counter_user[name]=tf_idf.tf(counter_i,term_user[name],'term')
	''''''
	print '-- Clean counters'
	if '' in counter.keys():
		del counter['']
		del counter_term['']
		''''''
		for name in names:
			del counter_user[name]['']
		''''''
	
	for term in counter_term:
		if counter_term[term]==0:
			term_remove+=[term]
	for term in term_remove:
		del counter_term[term]
		''''''
		for name in names:
			del counter_user[name][term]
		''''''

	
	print '-- Calculate cls-term tfidf'
	term_tfidf=tf_idf.tf_idf(counter,counter_term,N)
	''''''
	print '-- Calculate user-term tfidf'
	user_tfidf=init_clusters(len(names),names)
	for name in names:
		user_tfidf[name]=tf_idf.tf_idf(counter_term,counter_user[name],m)
	''''''
	print '-- Calculate term norm-tfidf'
	#user_tfidf=[max([user_tfidf[name][term] for name in names]) for term in term_tfidf]
	user_tfidf=[np.std([user_tfidf[name][term] for name in names]) for term in term_tfidf]
	#user_tfidf=[0 for term in term_tfidf]

	term=term_tfidf.keys()
	n=len(term)
	tfidf=[term_tfidf[term[i]]/(1+user_tfidf[i]) for i in xrange(n)]
	
	return [(term[i],tfidf[i]) for i in xrange(n)]
Example #19
def tf_idf_test(fname,cl=0,sep='\t'):
	data=file('txt\\'+fname+'.txt').readlines()[1:]
	n=len(data)
	data=[data[i][:-1].split(sep) for i in xrange(n)]
	user=[data[i][1] for i in xrange(n) if int(data[i][0])==cl]
	term=[data[i][-1].split(',') for i in xrange(n)]
	term_cl=[term[i] for i in xrange(n) if int(data[i][0])==cl]

	names=list(set(user))
	m=len(user)
	count=[len([user[i] for i in xrange(m) if user[i]==name]) for name in names]
	idx=np.argsort(count)[::-1]
	#N=int(len(idx)*0.1)
	N=50
	names=[names[i] for i in idx[:N]]
	term_user=init_clusters(len(names),names)	
	for name in names:
		term_user[name]=[term_cl[i] for i in xrange(m) if user[i]==name]

	print 'Count all tf'
	counter=Counter()
	counter=tf_idf.tf(counter,term,'term')

	print 'Count cl tf'
	counter_i=copy.copy(counter)
	counter_i.subtract(counter)
	counter_term=tf_idf.tf(counter_i,term_cl,'term')

	remove=[]
	for term in counter_term:
		if counter_term[term]==0:
			remove+=[term]
	for term in remove:
		del counter_term[term]

	print 'Count user tf'
	print '#_name',len(names)
	counter_user=init_clusters(len(names),names)
	for name in names:
		print '-- %s'%name
		counter_i=copy.copy(counter_term)
		counter_i.subtract(counter_term)
		counter_user[name]=tf_idf.tf(counter_i,term_user[name],'term')
	
	print 'Calculate cl tfidf'
	term_tfidf=tf_idf.tf_idf(counter,counter_term,n)

	print 'Calculate user tfidf'
	user_tfidf=init_clusters(len(names),names)
	for name in names:
		user_tfidf[name]=tf_idf.tf_idf(counter_term,counter_user[name],m)
	
	print 'Sort tfidf'
	user_tfidf=[max([user_tfidf[name][term] for name in names]) for term in term_tfidf]
	#user_tfidf=[np.std([user_tfidf[name][term] for name in names]) for term in term_tfidf]
	
	term=term_tfidf.keys()
	n=len(term)
	tfidf=[term_tfidf[term[i]]/(1+user_tfidf[i]) for i in xrange(n)]
	term_tfidf=[term_tfidf[term[i]] for i in xrange(n)]

	f=os.open('txt\\tfidf_test.txt', os.O_RDWR|os.O_CREAT)
	idx=np.argsort(term_tfidf)[::-1]	
	os.write(f,'term_tfidf,'+','.join([term[i] for i in idx[:10]])+'\n')
	os.write(f,','+','.join(['%0.4f'%term_tfidf[i] for i in idx[:10]])+'\n')

	idx=np.argsort(user_tfidf)[::-1]	
	os.write(f,'user_tfidf,'+','.join([term[i] for i in idx[:10]])+'\n')
	os.write(f,','+','.join(['%0.4f'%user_tfidf[i] for i in idx[:10]])+'\n')
	
	idx=np.argsort(tfidf)[::-1]	
	os.write(f,'tfidf,'+','.join([term[i] for i in idx[:10]])+'\n')
	os.write(f,','+','.join(['%0.4f'%tfidf[i] for i in idx[:10]])+'\n')
	
	os.close(f)
Example #21
def compare_bars(bar, lat1, lon1, miles):
    con = connect("bar_data_test.db")
    c1 = con.cursor()
    print bar
    review11 = c1.execute("SELECT REVIEW FROM BARS WHERE name like '%s' " %
                          bar)
    review1 = c1.fetchone()
    #print review1
    for review in review1:
        r1 = review

        #print r1
    old_category1 = c1.execute(
        "SELECT CATEGORY FROM BARS WHERE name like '%s'" % bar)
    old_category = c1.fetchone()
    c = [category_new for category_new in old_category]

    c = clean_content(c).lower()
    print c

    #**************
    def unique_list(l):
        ulist = []
        [ulist.append(x) for x in l if x not in ulist]
        return ulist

    comp = ' '.join(unique_list(c.split()))
    comp = comp.split()
    categry_match = []
    comp_list = []
    sort_before = []
    length = ''

    #print comp
    def words_in_string(word_list, a_string):
        return set(word_list).intersection(a_string.split())

    rows = c1.execute("SELECT NAME, CATEGORY	from BARS ")
    for row in rows:
        print row[1]
        i = 0
        for word in words_in_string(comp, row[1]):
            print(word)
            i += 1
        categry_match.append([i, row[0]])
    categry_match.sort(key=lambda x: x[0], reverse=True)
    j = 0
    while j < 5:
        bar_match = categry_match[j][1]
        review22 = c1.execute("SELECT REVIEW FROM BARS WHERE NAME like '%s'" %
                              bar_match)
        review2 = c1.fetchall()
        for review in review2:
            r2 = clean_content(review)
            #print r1,r2
        compare_bar = tf_idf.tf_idf(r1, r2)[0]
        comp_list.append(compare_bar[1])
        sort_before.append(bar_match)
        j += 1
    comp = comp_list
    bar_list = sort_before
    bars = []
    p = zip(comp, bar_list)
    #print p
    x = sorted(p, key=itemgetter(0), reverse=True)
    print x

    c1.execute('DROP TABLE IF EXISTS BAR_MATCH')
    con.commit()
    sql = """CREATE TABLE `BAR_MATCH` (
			NAME  TEXT,
			LATTITUDE TEXT,
			LONGITUDE TEXT
			)"""

    c1.execute(sql)

    for row in x:
        bars = row[1]
        y11 = c1.execute(
            "SELECT NAME, ADDRESS FROM BARS WHERE NAME like '%s'" % bars)
        y = c1.fetchall()

        for y1 in y:
            print y1
            lattitude, longitude = getDistance.findLocation(y1[1])
            print lattitude, longitude
            c1.execute("insert into BAR_MATCH values (?,?,?)",
                       (y1[0], longitude, lattitude))
            con.commit()
Example #22
pruned_data_5.reset_index(inplace=True)

#pairs_user = list(partition_users(pruned_data_5, 5, xf.SampleN(1)))
pairs_user = list(sample_users(pruned_data_5, 5, 12000, xf.SampleN(1)))
pickle_out = open("sample_user.pickle", "wb")
pickle.dump(pairs_user, pickle_out)
pickle_out.close()

truth = pd.concat((p.test for p in pairs_user))
#truth.to_csv(r'results/steam/pruned_5.csv')


def algo_eval(path, algo, dataset):
    evaluation = batch.MultiEval(path=path, predict=False, recommend=100)
    evaluation.add_algorithms(algos=algo)
    evaluation.add_datasets(data=dataset)
    evaluation.run()


algo_ii = item_knn.ItemItem(20, center=False, aggregate='sum')
#algo_uu = user_knn.UserUser(30, center=False, aggregate='sum')
algo_pop = basic.Popular()
algo_mf = ImplicitMF(40)
algo_bpr = BPR()
algo_tf_idf = tf_idf()
algo_LDA = LDA()

algo_eval('results/steam/all_algo_sample_user',
          [algo_LDA, algo_tf_idf, algo_ii, algo_pop, algo_mf, algo_bpr],
          pairs_user)
Example #23
import tkinter as tk
import tkinter.filedialog as tkfd
import os
from tkinter.scrolledtext import ScrolledText
from tf_idf import tf_idf

table = tf_idf()

main_window = tk.Tk()
main_window.title("TD-IDF")
main_window.geometry("1250x620")


def open_filedialog():
    global folder
    folder = tkfd.askdirectory(initialdir=os.path.dirname(__file__) + '/..', )
    entry_dir_name.delete(0, tk.END)
    entry_dir_name.insert(0, folder)
    print(folder)
    return 0


def execute():
    table = tf_idf()
    text_output.delete('1.0', tk.END)
    folder_name = folder
    num_of_files = len([
        name for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    ]) + 1
    for x in range(1, num_of_files):
Example #24
from morphological_analysis import analysis
from tf_idf import tf_idf

BLOG = 'mana_blog.csv'
LEMMAS = '/home/output/mana_lemmas.csv'

analysis(BLOG, LEMMAS)
tf_idf(LEMMAS)
@app.route('/search/results', methods=['GET', 'POST'])
def search_request():
    # print(request.form["input"])
    search_term = request.form.get("input")
    # search_term = flask.request.args.get('name')
    Q = cosine_similarity(books_data=books_data,
                          DF=DF,
                          tf_idf=tf_idf,
                          total_vocab=total_vocab,
                          total_vocab_size=total_vocab_size,
                          k=10,
                          query=search_term)
    print(Q)
    return render_template('results.html', res=Q)


# def index():
#     return render_template('index.html', variable = Q)

if __name__ == "__main__":
    load_data = False
    if not load_data:
        books_data = load_file()
        N = books_data.shape[0]
        processed_bookname, processed_text = process_data(books_data)
        DF, total_vocab_size, total_vocab = build_DF(N, processed_text,
                                                     processed_bookname)
        tf_idf, df = tf_idf(N, processed_text, processed_bookname)
    # Q = cosine_similarity(books_data = books_data,DF = DF, tf_idf = tf_idf,total_vocab = total_vocab, total_vocab_size = total_vocab_size, k = 10, query = "The evening of the day on which Mr Gibson had been to see the squire")
    app.run(debug=True)
Example #26
print(train_x.shape)
print(valid_x.shape)
print(train_y.shape)
print(valid_y.shape)
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

#Count vectorizing the training data
train = []
train = bow.bag_of_words(train_data, X, train_x, valid_x)
xtrain_count = train[0]
xvalid_count = train[1]
#Tf-IDF of training data
tf_idf = []
tf_idf = tfi.tf_idf(train_data, X, train_x, valid_x)
xtrain_tfidf = tf_idf[0]
xvalid_tfidf = tf_idf[1]
xtrain_tfidf_ngram = tf_idf[2]
xvalid_tfidf_ngram = tf_idf[3]
xtrain_tfidf_ngram_chars = tf_idf[4]
xvalid_tfidf_ngram_chars = tf_idf[5]

#Naive Bayes classifier implementation
nb.Naive_Bayes(xtrain_count, xvalid_count, train_y, valid_y, my_tags,
               xtrain_tfidf, xvalid_tfidf, xtrain_tfidf_ngram,
               xvalid_tfidf_ngram, xtrain_tfidf_ngram_chars,
               xvalid_tfidf_ngram_chars)
#Bernoulli Naive Bayes classifier implementation
bnb.Bernoulli_Naive_Bayes(xtrain_count, xvalid_count, train_y, valid_y,
                          my_tags, xtrain_tfidf, xvalid_tfidf,
Example #27
from Co_Occurrence import CoOccur
from cosine import cosine_sim
from tf_idf import tf_idf
from Co_Occurrence import docFreq
import json
import timeit
from file_read_write import file_read_write


query = input()
start = timeit.default_timer()
coOccur_obj = CoOccur(None)  # a coOccur temporary object to tokenize the query
query_tf = tf_idf()
query_list = list(query.split(" "))
query_list = coOccur_obj.spell_check(query_list)
empty_str = " "
query = empty_str.join(query_list)
query_refined = coOccur_obj.tokenize(query)  # tokenize query,stem and remove stop words
print(query_refined)
query_refined.sort()


file_reader_object = file_read_write()

# search query in cache
file_reader_object.cache_reader(query_refined, start)

# read dataset of .txt files into dataframe
df = file_reader_object.dataset_reader()

# df = pd.DataFrame(data, columns=['headline', 'brief', 'article', 'type', 'filename'])
Example #28
def results(algo=None):
    print("algorithm:", algo if algo in ALGOS else None)

    samples = 500  # up to 2000
    print("sample size:", samples)

    keys = glob.glob('Inspec/keys/*.key')
    res = [0] * samples

    if algo == 'textrank':
        # load a spaCy model, depending on language, scale, etc.
        nlp = spacy.load("en_core_web_sm")
        # add PyTextRank to the spaCy pipeline
        tr = pytextrank.TextRank()
        nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    elif algo == 'sentiment_pos' or algo == 'sentiment_pos_tfidf':
        sid = SentimentIntensityAnalyzer()

    for i, key in enumerate(keys[:samples]):
        # get actual keywords
        key_file = open(key)
        whitespace = re.compile(r"\s+")
        # remove whitespace and convert to lowercase
        actual = [
            whitespace.sub(" ", w).strip().lower()
            for w in key_file.readlines()
        ]

        # get text document corresponding to current key
        num = re.findall(r'\d+', key)[0]
        doc = 'Inspec/docsutf8/{}.txt'.format(num)

        # get extracted keywords
        if algo == 'rake':
            extracted = rake(doc)
        elif algo == 'textrank':
            extracted = textrank(doc, nlp)
        elif algo == 'window':
            extracted = window(doc)
        elif algo == 'window_w_tf_idf':
            extracted = window_w_tf_idf(doc)
        elif algo == 'tf_idf':
            extracted = tf_idf(doc)
        elif algo == 'sentiment_pos':
            extracted = sentiment_pos(doc, sid)
        elif algo == 'sentiment_pos_tfidf':
            extracted = sentiment_pos_tfidf(doc, sid)
        else:
            extracted = extract(doc)

        # calculate results
        tp = len(set(extracted).intersection(
            set(actual)))  # number of true positives
        precision = tp / len(extracted)
        recall = tp / len(actual)
        f_measure = (2 * precision * recall) / (
            precision + recall) if precision + recall else 0
        res[i] = (precision, recall, f_measure)

    # calculate average results
    avg_res = [sum(x) / len(x) for x in zip(*res)]
    print("precision: {}, recall: {}, F-measure: {}".format(*avg_res))
Example #29
        for j, y in enumerate(words_weight[i]):
            print words[j], words_weight[i][j]
        print

    for i in range(0, 4):
        score = get_cossimi(words_weight[i], words_weight[4])
        print 'score', i, ':', score
        print


if __name__ == '__main__':

    # question = '我在哪里可以吃到海鲜意面'
    question = '怎么学好编程'

    from tf_idf import tf_idf, get_corpus_list
    from conf.website_conf import website_list
    website_corpus = get_corpus_list(website_list)
    from wikipedia_expansion import get_question_expansion_corpus
    # question_corpus = get_question_expansion_corpus(question)

    from jieba_split import split_word_only
    question_corpus = split_word_only(question)

    website_corpus.append(question_corpus)
    words, words_weight = tf_idf(website_corpus)
    scores = batch_get_sort_scores(words_weight, website_list)

    for i in scores:
        print i[0], ':', i[1]
Example #30
def getXY(input, algo, model, test=0, k=25):
    """
    input: 预处理过的语料库
    algo: 使用的特征权重计算方法名
    model: 使用的模型名

    test = 0 : 记录文件中出现的词汇并构造词汇表(训练集)
    test = 1 : 不构造词汇表,用已经构造好的(测试集)

    """
    global package

    # global voca_list
    # global labelset_list
    # global vocafreq_list
    # global weights_list
    # global doclist
    # global docname

    corpus = preprocess(input, package, test, k)
    labelset = package["labelset"]  # read back the package fields filled in by preprocess
    voca = package["voca"]

    level = 2
    mod = 0
    if algo == "tf_idf":
        weights = tf_idf(corpus, test, package)
        mod = 1
    elif algo == "tf_dc":
        weights = tf_dc(corpus, test, package)
    elif algo == "tf_bdc":
        weights = tf_bdc(corpus, test, package)
    elif algo == "iqf_qf_icf":
        weights = iqf_qf_icf(corpus, test, package)
    elif algo == "tf_eccd":
        weights = tf_eccd(corpus, test, package)
    elif algo == "tf_ig":
        weights = tf_ig(corpus, test, package)
    elif algo == "tf_rf":
        weights = tf_rf(corpus, test, package)
        level = 3
    elif algo == "tf_chi":
        weights = tf_chi(corpus, test, package)
        level = 3
    elif algo == "tf_mrf":
        weights = tf_mrf(corpus, test, package)
        level = 3
    elif algo == "tf_nrf":
        weights = tf_nrf(corpus, test, package)
        level = 3
    elif algo == "tf_vc":
        weights = tf_vc(corpus, test, package)

    # print weights
    X = []
    Y = []  # label set
    count = 0
    vocalen = len(voca)
    for doc in corpus:
        if count % 1000 == 0:
            print(str(count) + "/" + str(len(corpus)))
            # print('weights\'s size:')
            # print(sys.getsizeof(weights))
            # print(sys.getsizeof(X))
            # process = psutil.Process(os.getpid())
            # print('Used Memory:', process.memory_info().rss / 1024 / 1024, 'MB')
            # print(memory_usage_psutil())
        count += 1

        # process label
        labelset.append(doc["label"])
        Y.append(int(np.argmax(one_hot(
            labelset)[-1])))  # append the label to the fixed labelset so its position stays consistent, then truncate
        # np.argmax returns the index of the maximum value
        labelset = labelset[:-1]  # reset labelset

        # process word
        temvocalist = list(voca) + list(
            doc["split_sentence"])  # same idea as the label: voca fixes the word positions

        tem_one_hot = one_hot(temvocalist)[vocalen:]  # truncate

        # for word in range(len(tem_one_hot)):  # .shape[0]
        for word in range(tem_one_hot.shape[0]):
            temlabel = doc["label"]  # earn
            temword = doc["split_sentence"][word]
            temdoc = doc["document"]  # earn638

            # print("\ntem_one_hot:")
            # print(tem_one_hot)
            # print("\n")

            # weights: term frequency * feature weight
            if level == 2:
                if mod == 0:  # supervised weighting
                    tem_one_hot[word] *= weights[temlabel][temword]
                else:  # unsupervised weighting
                    tem_one_hot[word] *= weights[temdoc][temword]
            else:
                tem_one_hot[word] *= weights[temlabel][temdoc][temword]

        # guard against an empty array
        try:
            tem_one_hot = np.max(tem_one_hot, axis=0)  # drop redundant rows, keeping only the max of each column
        except ValueError:
            # tem_one_hot = tem_one_hot[0]
            # print(tem_one_hot)
            pass

        if model.lower() == "knn":
            tem_one_hot = preprocessing.normalize(
                np.array(tem_one_hot).reshape(1, -1), norm='l2')  # reshape into a matrix (row vector)

        # print(tem_one_hot.toarray())
        # tem_one_hot = np.full(tem_one_hot)
        # print(tem_one_hot)  # converts the sparse matrix back to a dense one!
        # print(type(tem_one_hot.toarray()))

        X.append(np.squeeze(tem_one_hot.toarray().tolist()))
        # print(tem_one_hot.toarray().tolist())

        # X.append(tem_one_hot)

    # print(np.array(X))
    # print(Y)

    return X, Y  # squeeze collapses dimensions, e.g. 2-D down to 1-D
def compare_restaurants(rest,lat1,lon1,miles):
	con=connect("restinfo.db")
	c1=con.cursor()
	rest=rest.replace("'", "")
	review1=c1.execute("SELECT REVIEW FROM RESTAURANTS WHERE name like '%s' "%rest)

	for review in review1:
		r1=review[0]

	old_category=c1.execute("SELECT CATEGORY FROM RESTAURANTS WHERE name like '%s'"%rest) 	
	old_category=c1.fetchone()                                                                         
                                           
	for category_new in old_category: 
		c=category_new.lower()
	print c  

	comp=''
	rest_list=''
	comp_list=[]
	sort_before=[]	
	length=''

	category_match=c1.execute("SELECT NAME FROM RESTAURANTS WHERE CATEGORY like '%s'" %c)
	myqueryrecords = c1.fetchall()
	i=0
	length=len(myqueryrecords)
	while i<len(myqueryrecords):
		rest_match=myqueryrecords[i][0]
		print rest_match
		c1.execute("SELECT ADDRESS FROM RESTAURANTS WHERE NAME like '%s'" %rest_match)
		rest_address = c1.fetchone()
		rest_add=rest_address[0]
		print rest_add
		rest_match=rest_match.replace("'", "")
		lat2,lon2 = findLocation(rest_add)
		#print lat2, lon2
		rest_dist = findDistance(lat1, lon1, lat2, lon2)
		print rest_dist
		print miles
		if rest_dist < float(miles):
			print rest_match
			review2=c1.execute("SELECT REVIEW FROM RESTAURANTS WHERE NAME like '%s'" %rest_match)
			for review in review2:
				r2=review[0]	
			compare_rest=tf_idf.tf_idf(r1,r2)[0]
			comp_list.append(compare_rest[1])
			sort_before.append(myqueryrecords[i][0])
			comp=comp_list	
			rest_list=sort_before
		restaurants=[]	
		p=zip(comp,rest_list)
		x=sorted(p, key=itemgetter(0),reverse=True)
		i+=1
	c1.execute('DROP TABLE IF EXISTS RESTAURANTS_MATCH')
	con.commit()
	sql="""CREATE TABLE RESTAURANTS_MATCH (
			NAME  FLOAT,
			LATTITUDE FLOAT,
			LONGITUDE FLOAT,
			ADDRESS FLOAT
			)"""
	print x
	c1.execute(sql)
	
	for row in x:
		print row[1]
		restaurants=row[1].replace("'", "")
		y11=c1.execute("SELECT NAME, ADDRESS FROM RESTAURANTS WHERE NAME like '%s'" %restaurants)
		y=c1.fetchall()
		for y1 in y:
			latitude, longitude = findLocation(y1[1])
			print y1[0], latitude, longitude
			c1.execute("insert into RESTAURANTS_MATCH (name, lattitude,longitude,address) VALUES (?,?,?,?)",(y1[0],latitude,longitude,y1[1]))
			con.commit()
Example #32
        #print("\n"+str(x) +":"+str(len(fileContent)))
        sentenceList = fileContent.split(u"।")
        sentenceListSum = fileContentSummary.split(u"।")
        no_of_sentence = len(sentenceList)
        for y in range(0, no_of_sentence):
            multi = math.pow(10, len(str(y)))
            sentId = ((x * multi) + y) / multi
            sentenceDict[sentId] = sentenceList[y]
            #sentenceDict[((x * multi) + y ) / multi] = sentenceList[y]
            if (sentenceDict[sentId] in sentenceListSum):
                sentenceDictLabel[sentId] = 'Y'
            else:
                sentenceDictLabel[sentId] = 'N'
            sentenceDictLen[sentId] = len(sentenceDict[sentId].split())
    #call for tf-idf
    res_idf = tf_idf(sentenceDict)
    #call for tf-isf
    res_isf = tf_isf(sentenceDict)
    #merge into one dictionary
    featureVec = merge(sentenceDict)

    # ----TestData Set-------------------------------------
    #read test data
    fileNameTest = 'complete_corpus\\testFile\\testInput' + str(1) + ".txt"
    #fileContentTest = read_from_file(fileNameTest)
    fileContentTest = tokenize_testFile(fileNameTest)

    no_of_sentence_test = len(fileContentTest)
    for y in range(0, no_of_sentence_test):
        multi = math.pow(10, len(str(y)))
        sentenceDictTest[((1 * multi) + y) / multi] = fileContentTest[y]
 elif "adaboo" in a[1]:
     adaboo_flag = 1
 elif "rf" in a[1]:
     rf_flag = 1
 elif "nb" in a[1]:
     nb_flag = 1
 elif "bagging" in a[1]:
     bagging_flag = 1
 
 training_matrix = input.input("5k_spring_2016_training_dataset.txt", 15000, 40293)
 testing_matrix = input.input("5k_spring_2016_testing_dataset.txt", 15000, 40293)
 training_label = input.label("5k_spring_2016_label_training.txt", 15000)
 
 training_matrix, testing_matrix, combine = input_preprocess(training_matrix, testing_matrix)
 #Getting tf-idf of the matrix
 tf_idf_combine =  tf_idf.tf_idf(combine, combine)
 tf_idf_training_matrix = tf_idf.tf_idf(combine, training_matrix)
 tf_idf_testing_matrix = tf_idf.tf_idf(combine, testing_matrix)
 
 if (lca_flag):
     print("Doing LCA")
     train = lca(tf_idf_combine, tf_idf_training_matrix)
     test = lca(tf_idf_combine, tf_idf_testing_matrix)
 elif (pca_flag):
     print("Doing PCA")
     train = pca_preprocess(tf_idf_combine, tf_idf_training_matrix)
     test = pca_preprocess(tf_idf_combine, tf_idf_testing_matrix)
 else:
     print("Doing No Reduction")
     train = tf_idf_training_matrix
     test = tf_idf_testing_matrix
Example #34
    # text_process.save_dict_dict(patent2title_stem_freq, "patent_title_stem_freq")
    # text_process.save_dict(stem2term, "stem2term")

    # save end

    title_stem_freq = load_dict("title_freq")
    abstr_stem_freq = load_dict("abstr_freq")
    patent2abstr_stem_freq = load_dict_dict("patent_abstr_stem_freq")
    patent2title_stem_freq = load_dict_dict("patent_title_stem_freq")
    stem2term = load_dict("stem2term")

    # print title_stem_freq
    # print abstr_stem_freq
    # print patent2title_stem_freq

    patent2descr_stem_score = tf_idf.tf_idf(patent2abstr_stem_freq)
    patent2title_stem_score = tf_idf.tf_idf(patent2title_stem_freq)

    patent2stem_score = {}
    for patent, stem_score in patent2descr_stem_score.items():
        if patent not in patent2stem_score.keys():
            patent2stem_score[patent] = {}

        for k, v in stem_score.items():
            if k in patent2stem_score[patent].keys():
                patent2stem_score[patent][k] += v
            else:
                patent2stem_score[patent][k] = v

    # for patent, stem_score in patent2title_stem_score.items():
    #     if patent not in patent2stem_score.keys():
Example #35
		if term in x2_by_term:
			x2_by_term[term] += x2
		else:
			x2_by_term[term] = x2

# divide by categories to get average
print "calculating average MI, X2 for terms"
num_cats = len(cats)

for term in mi_by_term:
	mi_by_term[term] /= num_cats
for term in x2_by_term:
	x2_by_term[term] /= num_cats

# for each term: compute TF-IDF, FREQ
tf_idf_by_term, freq_by_term = tf_idf.tf_idf(list(terms), doc_terms, vocab_size, set_select)

# save (term, value) pairs: MI
print "saving (term, value) pairs: MI"
top_mi = sorted(mi_by_term.items(), key = lambda (k,v): v, reverse = True)
with open(features + "/top_mi.p", 'wb') as file:
	pickle.dump(top_mi, file)

# save (term, value) pairs: X2
print "saving (term, value) pairs: X2"
top_x2 = sorted(x2_by_term.items(), key = lambda (k,v): v, reverse = True)
with open(features + "/top_x2.p", 'wb') as file:
	pickle.dump(top_x2, file)

# save (term, value) pairs: TF-IDF
print "saving (term, value) pairs: TF-IDF"
Example #36
def extract_tfidf(word_list):
    return tf_idf.tf_idf(word_list)
Example #37
def getXY(input, algo, model, test=0):
	"""
	input: 预处理过的语料库
	algo: 使用的特征权重计算方法名
	model: 使用的模型名	

	test = 0 : 记录文件中出现的词汇并构造词汇表(训练集)
	test = 1 : 不构造词汇表,用已经构造好的(测试集)
	
	"""
	global package
	corpus = preprocess(input, package, test)
	labelset = package["labelset"]
	voca = package["voca"]
	
	level = 2
	mod = 0
	if algo == "tf_idf":
		weights = tf_idf(corpus,test,package)
		mod=1
	elif algo == "tf_dc":
		weights = tf_dc(corpus,test,package)
	elif algo == "tf_bdc":
		weights = tf_bdc(corpus,test,package)
	elif algo == "iqf_qf_icf":
		weights = iqf_qf_icf(corpus,test,package)
	elif algo == "tf_eccd":
		weights = tf_eccd(corpus,test,package)
	elif algo == "tf_ig":
		weights = tf_ig(corpus,test,package)
	elif algo == "tf_rf":
		weights = tf_rf(corpus,test,package)
		level = 3
	elif algo == "tf_chi":
		weights = tf_chi(corpus,test,package)
		level = 3
	#print weights 
	X = []
	Y = []
	count = 0
	vocalen = len(voca)
	for doc in corpus:
		if count%100 ==0:
			print str(count) + "/" + str(len(corpus)) 
		count+=1
		# process label
		labelset.append(doc["label"])
		Y.append(int(np.argmax(one_hot(labelset)[-1])))
		labelset = labelset[:-1]
		
		# process word
		temvocalist = voca + doc["split_sentence"]
		tem_one_hot = one_hot(temvocalist)[vocalen:]
		for word in range(len(tem_one_hot)):
			temlabel = doc["label"]
			temword = doc["split_sentence"][word]
			temdoc = doc["document"]
			if level == 2:
				if mod ==0:
					tem_one_hot[word] *= weights[temlabel][temword]
				else:
					tem_one_hot[word] *= weights[temdoc][temword]
			else:
				tem_one_hot[word] *= weights[temlabel][temdoc][temword]

		tem_one_hot = np.max(tem_one_hot,axis=0)
		if (model.lower()=="knn"):
			tem_one_hot = preprocessing.normalize(np.array(tem_one_hot).reshape(1,-1), norm='l2')
		X.append(tem_one_hot)

	return np.squeeze(X),Y
import math
import re
from collections import defaultdict

from InputParser import *
import tf_idf

tf_idf_obj = tf_idf.tf_idf()
# The query needs to be vectorized: the tf of the query is based only on the query itself,
# but the idf of the query is based on the whole document collection.
# Then we compute the score between the query and each document and sort by that score.
# The InputParser defined earlier is reused, since it provides the tokenizer.

# This function should compute the cosine similarity of one doc to the query
# Only a computational function.
def cosine(vector1, vector2):
	# if vector2.count(0) + 1 >= len(vector2):
	# 	return 0
	top = sum([vector1[i] * vector2[i] for i in range(len(vector1))])
	bottom = math.sqrt(sum([pow(i,2) for i in vector1]))
	return float(top) / float(bottom) + (len(vector2) - vector2.count(0)) * 2 if bottom > 0 else 0

# The steps are as follows:
# 1. The InputParser first parses all the files and gathers the data.
# 2. Get the query, split it with the same pattern used in the InputParser and remove the stopwords.
# 3. Compute the tf-idf vector of the query and store it.
# 4. Compute the tf-idf vector of each document and store it.

# 5. Compute the cosine value comparing the query against each document.
# 6. Sort the cosine values in decreasing order, then select the top K documents to return.
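A compact sketch of steps 4-6, assuming query_vector and doc_vectors are the tf-idf vectors described above (the names here are illustrative):

def rank_documents(query_vector, doc_vectors, k=10):
	# score every document against the query using the cosine() defined above
	scores = [(doc_id, cosine(query_vector, doc_vector)) for doc_id, doc_vector in enumerate(doc_vectors)]
	# sort the cosine values decreasingly and keep the top K documents
	scores.sort(key=lambda pair: pair[1], reverse=True)
	return scores[:k]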