def calc_tf_idf(
    allchapts
):  # compute TF-IDF using the TF_IDF module functions that calculate tf and idf separately
    documents_list = []
    for document in allchapts:
        tf_idf_dictionary = {}
        computed_tf = TF_IDF.tf(document)
        for word in computed_tf:
            tf_idf_dictionary[word] = computed_tf[word] * TF_IDF.idf(
                word, allchapts)
        documents_list.append(tf_idf_dictionary)
    conn = sqlite3.connect('counting/database2.db')
    c = conn.cursor()
    for n, tfidf_dict in enumerate(documents_list, start=1):
        c.execute('CREATE TABLE IF NOT EXISTS chapt' + str(n) +
                  '_tfidf (nom TEXT, tf_idf FLOAT)')
        insert_comm = ('INSERT INTO chapt' + str(n) +
                       '_tfidf (nom, tf_idf) VALUES (?, ?)')
        for key, value in tfidf_dict.items():
            c.execute(insert_comm, (key, value))
        conn.commit()  # commit once per chapter table instead of once per row
    c.close()
    conn.close()
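The snippet above assumes a TF_IDF module with separate tf and idf helpers. A minimal sketch of what such helpers might look like, assuming tf is raw term frequency normalized by document length and idf is log-scaled inverse document frequency (the project's actual module may differ):

import math


def tf(document):
    # document: a list of tokens; returns {word: relative frequency in the document}
    counts = {}
    for word in document:
        counts[word] = counts.get(word, 0) + 1
    total = len(document)
    return {word: n / total for word, n in counts.items()}


def idf(word, all_documents):
    # log-scaled inverse document frequency over the whole collection
    containing = sum(1 for doc in all_documents if word in doc)
    return math.log(len(all_documents) / containing) if containing else 0.0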
Code example #2
def add_file(fname, win):
    fin = open("Source/" + fname, "r")
    fout = open("Files/" + fname, "w")
    for line in fin:
        fout.write(line)
    fout.close()
    fin.close()
    os.remove("Source/" + fname)
    TF_IDF.index_text_file()
    win.destroy()
Code example #3
def add_all_files(win):
    for txt_filename in listdir("Source"):
        fptr = open("Source/" + txt_filename, "r")
        pfile = open("Files/" + txt_filename, "w")
        for line in fptr:
            pfile.write(line)

        fptr.close()
        pfile.close()
        os.remove("Source/" + txt_filename)
    TF_IDF.index_text_file()
    win.destroy()
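Both helpers above copy a file line by line and then delete the source. A shorter equivalent sketch using the standard library's shutil (assuming the same Source/ and Files/ directory layout), for comparison only:

import os
import shutil


def move_file(fname):
    # move Source/<fname> into Files/ in one call instead of copy-then-delete
    shutil.move(os.path.join("Source", fname), os.path.join("Files", fname))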
Code example #4
    def __init__(self):
        print('File preprocessing in progress...')
        self.fpp=fpp.File_pp.get_Instance() # perform data preprocessing
        self.tf_idf=TF_IDF.Tf_Idf()
        self.rec = calSGD.Recommend_Engine.get_instance(lr=0.0002)
        self.calculated=False
        self.loaded=self.load_rating_matrix()
        if not self.loaded:
            print('No saved prediction rating matrix found. Please build one via admin mode.')
        else:
            print('Prediction matrix loaded.')
        _, _, self.items, _ = self.rec.get_matrices()
        self.tf_idf.tfidf_all(self.items.keys())  # precompute TF-IDF values for every item

        while True:
            print('-' * 5, 'music recommendation system', '-' * 5)
            print('1. User mode')
            print('2. Admin mode')
            print('3. Exit')
            select = input('select : ')

            if select == '1':
                self.user_mode()
            elif select == '2':
                self.admin_mode()
            elif select == '3':
                print()
                return
            else:
                print("올바른 숫자를 입력해주세요. (1-3)\n")
Code example #5
def CombineVector(line_sent, corpus):
    # Sentence vector = TF-IDF-weighted average of the sentence's word vectors.
    vector_line = np.zeros(shape=(1, 300))
    count = 0
    for word_sent in line_sent:
        vector_line += w2v_model.wv[word_sent] * TF_IDF.computeTf_Idf(
            corpus, word_sent)
        count += 1
    if count > 0:  # guard against empty sentences (division by zero)
        vector_line = vector_line / count
    return vector_line
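For context, a hypothetical call sketch; the names w2v_model and corpus and the 300-dimensional vectors are assumptions carried over from the snippet, not verified against the original project:

sentence = ["music", "recommendation", "system"]
sentence_vector = CombineVector(sentence, corpus)  # numpy array of shape (1, 300)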
Code example #6
def hide_files():
    win = Toplevel()
    res_text = "Select file to hide"
    Label(win, text=res_text).pack()
    hide_list = TF_IDF.get_hidden()
    for txt_filename in listdir("Files"):
        if txt_filename not in hide_list:
            Button(
                win,
                text=txt_filename,
                command=lambda data=txt_filename: hide_file(data, win)).pack()
    Button(win, text="Unhide all", command=lambda: unhide_all(win)).pack()
Code example #7
def evaluate():

    freq_rst_asp, freq_lptp_asp = frequencyBased.get_aspects()
    tfidf_rst_asp, tfidf_lptp_asp = TF_IDF.get_aspects()
    rst_com, lptp_com = depParsing.get_aspects()

    # dict-merge syntax (available since Python 3.5) to combine dictionaries
    freq_combined_rst_aspects = {**freq_rst_asp, **rst_com}
    freq_combined_lptp_aspects = {**freq_lptp_asp, **lptp_com}

    tfidf_combined_rst_aspects = {**tfidf_rst_asp, **rst_com}
    tfidf_combined_lptp_aspects = {**tfidf_lptp_asp, **lptp_com}

    # Train set
    train_rst_sentences, train_lptp_sentences = frequencyBased.load_train_sentences()
    print('$' * 10 + "On training data: " + '$' * 10)
    print('*' * 10 + "Frequency based method : " + '*' * 10)
    print('-' * 10 + "Restaurant" + '-' * 10)
    eval_on_one_set(train_rst_sentences, freq_combined_rst_aspects)
    print('-' * 10 + "Laptop" + '-' * 10)
    eval_on_one_set(train_lptp_sentences, freq_combined_lptp_aspects)
    print('*' * 10 + "End frequency" + '*' * 10)

    print('*' * 10 + "TF-IDF: " + '*' * 10)
    eval_on_one_set(train_rst_sentences, tfidf_combined_rst_aspects)
    eval_on_one_set(train_lptp_sentences, tfidf_combined_lptp_aspects)
    print('*' * 10 + "End TF-IDF" + '*' * 10)

    print('=' * 80)
    # TEST set
    print('$' * 10 + 'On Test data' + '$' * 10)
    test_rst, test_lptp = frequencyBased.load_test_sentences()
    print('*' * 10 + "Frequency based method : " + '*' * 10)

    print('-' * 10 + "Restaurant" + '-' * 10)
    eval_on_one_set(test_rst, freq_combined_rst_aspects)
    print('-' * 10 + "Laptop" + '-' * 10)
    eval_on_one_set(test_lptp, freq_combined_lptp_aspects)

    print('*' * 10 + "End frequency" + '*' * 10)

    print('*' * 10 + "TF-IDF: " + '*' * 10)
    print('-' * 10 + "Restaurant" + '-' * 10)
    eval_on_one_set(test_rst, tfidf_combined_rst_aspects)
    print('-' * 10 + "Laptop" + '-' * 10)
    eval_on_one_set(test_lptp, tfidf_combined_lptp_aspects)
    print('*' * 10 + "End TF-IDF" + '*' * 10)
Code example #8
    corpus.append(pre_sour.pre_processing(file))
corpus_2d = []
for file in corpus:
    for sent in file:
        corpus_2d.append(sent)

#word2vec
w2vs.start(corpus_2d)

# number of lines in each source file
number_line = []
for sourcefile in corpus:
    number_line.append(len(sourcefile))
line_max = max(number_line)
matrix = []
tf_idf = ti.computeTf_Idf(corpus)

print(corpus)
x = np.asarray(corpus)
print(x.shape)

for sourcefile in corpus:
    vecto = np.zeros(shape=(line_max, 300))
    for i in range(len(sourcefile)):
        vecto[i] = ex_vec.CombineVector(sourcefile[i], tf_idf)
        print(i)
    matrix.append(vecto)
x = np.asarray(matrix)
print(x.shape)
# matrix is the list of per-source-file matrices
# list of feature vectors
Code example #9
File: UI.py  Project: guttman10/IR-finalproj
try:
    from Tkinter import Entry, Frame, Label, StringVar
    from Tkconstants import *
    from nltk.stem import PorterStemmer
except ImportError:
    from tkinter import Entry, Frame, Label, StringVar, Toplevel, Button, Text, Scrollbar
    from tkinter.constants import *
    from nltk.stem import PorterStemmer

import TF_IDF

ps = PorterStemmer()
postf = TF_IDF.get_tw(TF_IDF.parseindex())
hide_list = TF_IDF.get_hidden()


def hex2rgb(str_rgb):
    try:
        rgb = str_rgb[1:]

        if len(rgb) == 6:
            r, g, b = rgb[0:2], rgb[2:4], rgb[4:6]
        elif len(rgb) == 3:
            r, g, b = rgb[0] * 2, rgb[1] * 2, rgb[2] * 2
        else:
            raise ValueError()
    except:
        raise ValueError("Invalid value %r provided for rgb color." % str_rgb)

    return tuple(int(v, 16) for v in (r, g, b))
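A quick usage check (not from the original project) showing the two accepted hex formats:

>>> hex2rgb("#1a2b3c")
(26, 43, 60)
>>> hex2rgb("#fff")
(255, 255, 255)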
Code example #10
    def generate_answer_links(self):
        """
        Returns a list of StackOverflow answer URLs for the question stored in self.query.
        :return: List
        """
        urls = []
        print("*********************RELEVANT TAGS DECIDING PHASE*********************")
        try:
            final_list = most_relevant_tags(self.query)
            print("Row tags before Custom Model are ", final_list)

            final_list = remove_error_tags(self.query, final_list)
            print("Final tags are ",final_list)
            print("*********************************OVER*********************************")
            print("\n")
            api_url_1 = "https://api.stackexchange.com/2.2/similar?key=" + Auth_key + "&order=desc&sort=relevance&tagged=" + ";".join(
                final_list) + "&title="
            api_url_2 = self.query + "&site=stackoverflow"
            api_url = api_url_1 + api_url_2

            top_relevant_questions = requests.get(api_url)
            top_relevant_questions = top_relevant_questions.json()
            df = DataFrame(top_relevant_questions['items'])
        except Exception as e:
            print(e)

        print("*********************PHASE 1*********************")
        print("NO of Questions fetch by the STACK APPS API = ", len(df))

        # PHASE 1
        try:
            print("Removing Questions which are closed")
            df = df[df.closed_reason != 'duplicate']
            df = df[df.closed_reason != 'off-topic']
            df = df[df.closed_reason != "unclear what you're asking"]
            df = df[df.closed_reason != "too broad"]
            df = df[df.closed_reason != "primarily opinion-based"]
            print("Removing Questions which are not answered")
            df = df[df.is_answered]
            print("Removing Quesitons with negative scores")
            df = df[df.score>0]
        except Exception as e:
            print(e)
        print("**********************OVER**********************")
        print("\n")
        print("*********************PHASE 2*********************")
        print("NO of Questions left after PHASE 1 ARE  = ", len(df))
        # PHASE 2
        print("Creating extra features to sort the questions which are returned by the api")
        print("Using Watson for", len(df), "Question titles, Relax it will take a moment!")
        try:
            df['Tag_Match'] = df.apply(lambda row: len(set(row.tags).intersection(final_list)), axis=1)
            df['Similarity_index'] = df.apply(lambda row: TF_IDF.cosine_sim(row.title, self.query), axis=1)
            # df['keyword_in_title'] = df.apply(lambda row: most_relevant_tags(row.title), axis=1)
            # df['Keywords_Match'] = df.apply(lambda row: len(set(row.keyword_in_title).intersection(final_list)), axis=1)
            alpha = 0.5
            beta = 1.5
            # gamma = 1.2
            df['Final_function'] = df.apply(lambda row: (alpha * row.Tag_Match) +(beta *row.Similarity_index), axis=1)
            df = df.sort_values(by=["Final_function"], ascending=False)
        except Exception as e:
            print(e)

        print("**********************OVER**********************")
        print("\n")

        print("NO of Quesitons choosen after PHASE 2 are 5")
        print("Now deciding which answers are the best one!!")
        try:
            question_dict = {}
            for i in range(5):
                # print(df.iloc[i].title)
                # print(df.iloc[i].link)
                question_dict[df.iloc[i].question_id] = df.iloc[i].title

            for q_id in question_dict:
                # print(question_dict[q_id])
                for ans in Select_answers.top_answers_fun(str(q_id)):
                    urls.append(ans)
        except Exception as e:
            print(e)
        print("All done!")

        self.answer_urls.extend(urls)
        return self.answer_urls
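The Similarity_index feature above relies on TF_IDF.cosine_sim. A minimal stand-in sketch, assuming it computes TF-IDF cosine similarity between two short texts; this is an assumption about the helper, not the project's actual implementation:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def cosine_sim(text_a, text_b):
    # vectorize both texts with TF-IDF and return their cosine similarity
    tfidf = TfidfVectorizer().fit_transform([text_a, text_b])
    return cosine_similarity(tfidf[0], tfidf[1])[0][0]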
Code example #11
def tf_idf_new():
    stopword_path = "./after/hlt_stop.txt"
    bunch_path = "./after/text.dat"
    space_path = "./after/textspace.dat"
    train_tfidf_path = "./after/tfdifspace.dat"
    TF_IDF.vector_space(stopword_path, bunch_path, space_path, train_tfidf_path)
Code example #12
File: driver.py  Project: sanhitamurthy/SearchEngine
import stemming
import BM25
import QUERY_LIKELIHOOD
import TF_IDF

if __name__ == '__main__':
    stemming.main()
    BM25.main()
    QUERY_LIKELIHOOD.main()
    TF_IDF.main()
Code example #13
def main(question):
    '''Main function.'''
    print(
        "----------------------------------------------Document Retriever--------------------------------------------------"
    )
    p = glob.glob("doc/*")
    for i in p:
        os.remove(i)

    # question = input("Enter the question to be searched: ")	#take question input form user ? should be there

    tokens = QP.tokenize(question)  # split the question sentence into word tokens

    keywords = QP.postag(
        tokens)  #eliminating some keywords based on parts of speech tagging

    if (len(keywords) == 0):  # if pos tagging eliminates all the word
        keywords = tokens

    query_keys = QP.bigram(
        keywords)  # making search keys into two keys to search wikipedia

    if len(query_keys) == 0:  # if there is only one key element, bigrams cannot be formed
        query_keys = keywords

    print("search keys are : ", query_keys)

    #---------------------------Question Processing  Done-------------------------#

    n = AE.getArticles(
        query_keys
    )  # fetches the articles, soupifies them, and writes them to files under the doc folder
    print("Fetched ", n, "articles",
          sep=' ')  # just printing how many articles we fetched

    #-------------------------------TF-IDF-------------------------------------------#

    print('\n--------unigram rank---------')
    score = TF_IDF.unigrams(tokens)  # rank the fetched documents by unigram TF-IDF score

    max_score = score[0][1]
    max_score_id_uni = score[0][0]
    for i in range(1, len(score)):  # scan the full list (the original range skipped the last entry)
        if (score[i][1] > max_score):
            max_score = score[i][1]
            max_score_id_uni = score[i][0]
    print("analysing :", max_score_id_uni, sep=" ")

    max_score_id_uni = "0.txt"
    print('\n---------bigram rank---------')
    score = TF_IDF.bigrams(tokens)  # rank the fetched documents by bigram TF-IDF score

    max_score = score[0][1]
    max_score_id_bi = score[0][0]
    for i in range(1, len(score)):  # scan the full list (the original range skipped the last entry)
        if (score[i][1] > max_score):
            max_score = score[i][1]
            max_score_id_bi = score[i][0]
    print("analysing : ", max_score_id_bi,
          sep=" ")  # could also try unigram, bigram and trigram hashing

    max_score_id_bi = "0.txt"
    # one for unigram
    d = {
        "data": [{"title": max_score_id_uni, "paragraphs": []}],
        "version": "1.1"
    }
    f = open("doc/" + max_score_id_uni, "r", encoding="utf8").read()
    count = 0

    for i in f.split("\n"):
        if len(i) > 0:
            paragraph = {
                "context": i,
                "qas": [{
                    "answers": [{"answer_start": 123, "text": "actual answer ends"}],
                    "question": question,
                    "id": str(count)
                }]
            }
            d["data"][0]["paragraphs"].append(paragraph)
            count += 1

    # once for bigram file
    f = open("doc/" + max_score_id_bi, "r", encoding="utf8").read()
    temp = f.split("\n")
    count = 0
    for i in temp:
        if (len(i) > 0):
            temp = {
                "context":
                i,
                "qas": [{
                    "answers": [{
                        "answer_start": 123,
                        "text": "actual answer ends"
                    }],
                    "question":
                    question,
                    "id":
                    str(count)
                }]
            }
            d["data"][0]["paragraphs"].append(temp)
            count += 1

    p = open("data/squad/train-v1.1.json", "w")
    p.write(json.dumps(d))
    p.close()
    os.system("qa_answer.py")
    try:
        with open('dev-prediction1.json', 'r') as ansFile:
            answerFile = json.load(ansFile)
        print(answerFile["1"], answerFile["2"], answerFile["3"])
        if (tokens[0] == "who"):
            return answerFile["1"]
        elif (tokens[0] == "when"):
            return answerFile["3"]
        elif (tokens[0] == "where"):
            return answerFile["4"]
        else:
            with open('dev-prediction.json', 'r') as ansFile:
                answerFile = json.load(ansFile)
            return answerFile[
                "0"]  #+"\n"+answerFile["1"]+"\n"+answerFile["2"]+"\n"+answerFile["3"]+"\n"+answerFile["4"]+"\n"+answerFile["5"]+"\n"
    except NameError:
        with open("dev-prediction.json", 'r') as ansFile:
            answerFile = json.load(ansFile)
        return answerFile["0"]
Code example #14
        # recompute each centroid as the mean of the points assigned to it
        for cent in range(k):
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment


def show(dataSet, k, centroids, clusterAssment):
    numSamples, dim = dataSet.shape  # number of samples and feature dimension
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    for i in range(numSamples):  # plot each sample with its cluster's marker
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])

    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    for i in range(k):  # plot the k centroids with larger markers
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize=12)
    plt.show()


if __name__ == "__main__":
    feature_word = "/Users/macbookair/iCloud/Desktop/Daily/Second Major/CS/Machine Learning/hw3/Text_Cluster/feature_words.txt"
    stop_word = "/Users/macbookair/iCloud/Desktop/Daily/Second Major/CS/Machine Learning/hw3/Text_Cluster/stop_words.txt"
    text_file_path = "/Users/macbookair/iCloud/Desktop/Daily/Second Major/CS/Machine Learning/hw3/Text_Cluster/new_weibo_13638"
    stop_word_list = textdata.read_words(stop_word)
    feature_word_list = textdata.read_words(feature_word)
    docs_matrix = textdata.get_all_vector(text_file_path, stop_word_list, feature_word_list)
    dataMat = mat(TF_IDF.TF_IDF(docs_matrix))
    myCentroids, clusterAssing = kMeans(dataMat, 9)
    show(dataMat, 9, myCentroids, clusterAssing)
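For comparison only, the same clustering step could be sketched with scikit-learn's KMeans instead of the hand-rolled kMeans above, assuming docs_matrix is a 2-D array of TF-IDF features; this is an alternative, not the project's code:

import numpy as np
from sklearn.cluster import KMeans


def cluster_documents(docs_matrix, k=9):
    # fit k-means on the TF-IDF matrix and return centroids plus per-document labels
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(np.asarray(docs_matrix))
    return kmeans.cluster_centers_, kmeans.labels_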